diff --git "a/experiment_config.json" "b/experiment_config.json" new file mode 100644--- /dev/null +++ "b/experiment_config.json" @@ -0,0 +1,231153 @@ +{ + "training_args": { + "output_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters/qa_cosmoqa_answer_generation_lora_v1", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": true, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 4, + "per_device_eval_batch_size": 8, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 2e-05, + "weight_decay": 0.0, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3, + "max_steps": -1, + "lr_scheduler_type": "linear", + "lr_scheduler_kwargs": {}, + "warmup_ratio": 0.0, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters/qa_cosmoqa_answer_generation_lora_v1/runs/Sep05_08-48-54_gx28", + "logging_strategy": "steps", + "logging_first_step": false, + "logging_steps": 20, + "logging_nan_inf_filter": true, + "save_strategy": "epoch", + "save_steps": 40, + "save_total_limit": null, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "seed": 42, + "data_seed": null, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": false, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": [], + "dataloader_drop_last": false, + "eval_steps": 20, + "dataloader_num_workers": 0, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters/qa_cosmoqa_answer_generation_lora_v1", + "disable_tqdm": false, + "remove_unused_columns": true, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": null, + "greater_is_better": null, + "ignore_data_skip": false, + "fsdp": [], + "fsdp_min_num_params": 0, + "fsdp_config": { + "min_num_params": 0, + "xla": false, + "xla_fsdp_v2": false, + "xla_fsdp_grad_ckpt": false + }, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "split_batches": false, + "dispatch_batches": null, + "even_batches": true, + "use_seedable_sampler": true, + "non_blocking": false, + "gradient_accumulation_kwargs": null + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_token": "", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": false, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": "", + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false + }, + "lora_config": { + "task_type": "CAUSAL_LM", + "peft_type": "LORA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "revision": null, + "inference_mode": false, + "r": 16, + "target_modules": [ + "down_proj", + "o_proj", + "v_proj", + "up_proj", + "k_proj", + "gate_proj", + "q_proj" + ], + "exclude_modules": null, + "lora_alpha": 16, + "lora_dropout": 0.1, + "fan_in_fan_out": false, + "bias": "none", + "use_rslora": true, + "modules_to_save": null, + "init_lora_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "rank_pattern": {}, + "alpha_pattern": {}, + "megatron_config": null, + "megatron_core": "megatron.core", + "trainable_token_indices": null, + "loftq_config": {}, + "eva_config": null, + "corda_config": null, + "use_dora": false, + "layer_replication": null, + "runtime_config": { + "ephemeral_gpu_offload": false + }, + "lora_bias": false + }, + "flops": { + "eval": 49705559881469696, + "train": 1.368968336766336e+16, + "total": 6.339524324913306e+16 + }, + "total_energy": 20.575300000000002, + "logs": [ + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:49:23.131069", + "step": 0, + "epoch": 0 + }, + { + "type": "pplx", + "content": 107.54594258685516, + "timestamp": "2025-09-05 08:49:23.133261", + "step": 0, + "epoch": 0 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:49:23.415457", + "step": 0, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.6169126629829407, + "timestamp": "2025-09-05 08:49:23.417206", + "step": 1, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:49:23.625279", + "step": 1, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.7517166137695312, + "timestamp": "2025-09-05 08:49:23.627202", + "step": 2, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:49:23.802422", + "step": 2, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.6235635280609131, + "timestamp": "2025-09-05 08:49:23.804678", + "step": 3, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:23.985359", + "step": 3, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.8837859630584717, + "timestamp": "2025-09-05 08:49:24.194974", + "step": 4, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:49:24.346141", + "step": 4, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.643877387046814, + "timestamp": "2025-09-05 08:49:24.348074", + "step": 5, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:24.522959", + "step": 5, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.7255116105079651, + "timestamp": "2025-09-05 08:49:24.524779", + "step": 6, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:49:24.695679", + "step": 6, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5929327607154846, + "timestamp": "2025-09-05 08:49:24.697536", + "step": 7, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:49:24.876445", + "step": 7, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.6998869776725769, + "timestamp": "2025-09-05 08:49:24.892849", + "step": 8, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:25.062476", + "step": 8, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.7311207056045532, + "timestamp": "2025-09-05 08:49:25.064482", + "step": 9, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:49:25.235009", + "step": 9, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.6596672534942627, + "timestamp": "2025-09-05 08:49:25.238640", + "step": 10, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:49:25.414055", + "step": 10, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.8627897500991821, + "timestamp": "2025-09-05 08:49:25.416048", + "step": 11, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:49:25.585764", + "step": 11, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.7733902335166931, + "timestamp": "2025-09-05 08:49:25.602455", + "step": 12, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:25.771572", + "step": 12, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.6929965615272522, + "timestamp": "2025-09-05 08:49:25.773466", + "step": 13, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:25.950489", + "step": 13, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.622175395488739, + "timestamp": "2025-09-05 08:49:25.952905", + "step": 14, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:49:26.125306", + "step": 14, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5498591065406799, + "timestamp": "2025-09-05 08:49:26.127215", + "step": 15, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:49:26.296529", + "step": 15, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5476340651512146, + "timestamp": "2025-09-05 08:49:26.312315", + "step": 16, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:49:26.474267", + "step": 16, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.592532217502594, + "timestamp": "2025-09-05 08:49:26.476179", + "step": 17, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:49:26.645322", + "step": 17, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.624656617641449, + "timestamp": "2025-09-05 08:49:26.647578", + "step": 18, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:49:26.819386", + "step": 18, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.6042789816856384, + "timestamp": "2025-09-05 08:49:26.821212", + "step": 19, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:49:26.992800", + "step": 19, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.54128497838974, + "timestamp": "2025-09-05 08:49:27.006809", + "step": 20, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:49:31.633614", + "step": 20, + "epoch": 1 + }, + { + "type": "pplx", + "content": 98.93025738022824, + "timestamp": "2025-09-05 08:49:31.636504", + "step": 20, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:49:31.771651", + "step": 20, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.6219188570976257, + "timestamp": "2025-09-05 08:49:31.773972", + "step": 21, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:49:31.945046", + "step": 21, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.6864243745803833, + "timestamp": "2025-09-05 08:49:31.947077", + "step": 22, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:32.119341", + "step": 22, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.46079257130622864, + "timestamp": "2025-09-05 08:49:32.121160", + "step": 23, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:49:32.292480", + "step": 23, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.6522855162620544, + "timestamp": "2025-09-05 08:49:32.307856", + "step": 24, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:49:32.472758", + "step": 24, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4554615318775177, + "timestamp": "2025-09-05 08:49:32.474764", + "step": 25, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:32.643264", + "step": 25, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.530107319355011, + "timestamp": "2025-09-05 08:49:32.645124", + "step": 26, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:32.823718", + "step": 26, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5458198189735413, + "timestamp": "2025-09-05 08:49:32.825511", + "step": 27, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:49:32.995958", + "step": 27, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.6185097098350525, + "timestamp": "2025-09-05 08:49:33.012305", + "step": 28, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:33.181153", + "step": 28, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.48863735795021057, + "timestamp": "2025-09-05 08:49:33.183739", + "step": 29, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:33.360817", + "step": 29, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4185200035572052, + "timestamp": "2025-09-05 08:49:33.362640", + "step": 30, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:49:33.532666", + "step": 30, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5862270593643188, + "timestamp": "2025-09-05 08:49:33.534345", + "step": 31, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:49:33.709825", + "step": 31, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5197166800498962, + "timestamp": "2025-09-05 08:49:33.724694", + "step": 32, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:49:33.893918", + "step": 32, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4848283529281616, + "timestamp": "2025-09-05 08:49:33.897285", + "step": 33, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:49:34.070516", + "step": 33, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.48005831241607666, + "timestamp": "2025-09-05 08:49:34.072556", + "step": 34, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:49:34.244353", + "step": 34, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5067130923271179, + "timestamp": "2025-09-05 08:49:34.246260", + "step": 35, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:49:34.423602", + "step": 35, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5473061800003052, + "timestamp": "2025-09-05 08:49:34.438717", + "step": 36, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:49:34.608695", + "step": 36, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5381602048873901, + "timestamp": "2025-09-05 08:49:34.610450", + "step": 37, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:49:34.781445", + "step": 37, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.53970867395401, + "timestamp": "2025-09-05 08:49:34.783221", + "step": 38, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:34.961725", + "step": 38, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5283556580543518, + "timestamp": "2025-09-05 08:49:34.963772", + "step": 39, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:49:35.140557", + "step": 39, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4740239679813385, + "timestamp": "2025-09-05 08:49:35.154682", + "step": 40, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:49:39.785388", + "step": 40, + "epoch": 1 + }, + { + "type": "pplx", + "content": 87.66840919832546, + "timestamp": "2025-09-05 08:49:39.787493", + "step": 40, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 40", + "timestamp": "2025-09-05 08:49:40.264753", + "step": 40, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:49:40.437738", + "step": 40, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.48637235164642334, + "timestamp": "2025-09-05 08:49:40.439854", + "step": 41, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:40.645274", + "step": 41, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5245602130889893, + "timestamp": "2025-09-05 08:49:40.647013", + "step": 42, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:49:40.852808", + "step": 42, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4956504702568054, + "timestamp": "2025-09-05 08:49:40.854709", + "step": 43, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:49:41.052545", + "step": 43, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.49369287490844727, + "timestamp": "2025-09-05 08:49:41.069275", + "step": 44, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:49:41.266843", + "step": 44, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5377260446548462, + "timestamp": "2025-09-05 08:49:41.268933", + "step": 45, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:41.467042", + "step": 45, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3377070426940918, + "timestamp": "2025-09-05 08:49:41.468889", + "step": 46, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:41.666652", + "step": 46, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.49804773926734924, + "timestamp": "2025-09-05 08:49:41.668696", + "step": 47, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:41.866243", + "step": 47, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5739774703979492, + "timestamp": "2025-09-05 08:49:41.881595", + "step": 48, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:49:42.080545", + "step": 48, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.44619235396385193, + "timestamp": "2025-09-05 08:49:42.082562", + "step": 49, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:49:42.287133", + "step": 49, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5007292032241821, + "timestamp": "2025-09-05 08:49:42.289340", + "step": 50, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:42.490217", + "step": 50, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.47838714718818665, + "timestamp": "2025-09-05 08:49:42.492212", + "step": 51, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:49:42.699436", + "step": 51, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5947479605674744, + "timestamp": "2025-09-05 08:49:42.714002", + "step": 52, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:42.904233", + "step": 52, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4078122675418854, + "timestamp": "2025-09-05 08:49:42.906345", + "step": 53, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:49:43.103998", + "step": 53, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3910931646823883, + "timestamp": "2025-09-05 08:49:43.106013", + "step": 54, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:49:43.306892", + "step": 54, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5116702318191528, + "timestamp": "2025-09-05 08:49:43.308847", + "step": 55, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:43.506454", + "step": 55, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.40538397431373596, + "timestamp": "2025-09-05 08:49:43.523075", + "step": 56, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:49:43.720676", + "step": 56, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3850765526294708, + "timestamp": "2025-09-05 08:49:43.722821", + "step": 57, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:49:43.919447", + "step": 57, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5028924345970154, + "timestamp": "2025-09-05 08:49:43.922267", + "step": 58, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:49:44.120144", + "step": 58, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5674346089363098, + "timestamp": "2025-09-05 08:49:44.122315", + "step": 59, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:44.318793", + "step": 59, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4544109106063843, + "timestamp": "2025-09-05 08:49:44.332801", + "step": 60, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:49:48.947455", + "step": 60, + "epoch": 1 + }, + { + "type": "pplx", + "content": 80.36248908828922, + "timestamp": "2025-09-05 08:49:48.949747", + "step": 60, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:49:49.112997", + "step": 60, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4171891510486603, + "timestamp": "2025-09-05 08:49:49.115100", + "step": 61, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:49:49.283044", + "step": 61, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.32811057567596436, + "timestamp": "2025-09-05 08:49:49.285337", + "step": 62, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:49:49.491105", + "step": 62, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5731877088546753, + "timestamp": "2025-09-05 08:49:49.493306", + "step": 63, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:49.692196", + "step": 63, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.49480125308036804, + "timestamp": "2025-09-05 08:49:49.708875", + "step": 64, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:49:49.906046", + "step": 64, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5468859076499939, + "timestamp": "2025-09-05 08:49:49.908063", + "step": 65, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:50.105533", + "step": 65, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.40803736448287964, + "timestamp": "2025-09-05 08:49:50.107576", + "step": 66, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:49:50.303363", + "step": 66, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5299073457717896, + "timestamp": "2025-09-05 08:49:50.306069", + "step": 67, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:49:50.506081", + "step": 67, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.42571789026260376, + "timestamp": "2025-09-05 08:49:50.526032", + "step": 68, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:50.716930", + "step": 68, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4201086461544037, + "timestamp": "2025-09-05 08:49:50.718937", + "step": 69, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:50.916075", + "step": 69, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4884677231311798, + "timestamp": "2025-09-05 08:49:50.918031", + "step": 70, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:49:51.116971", + "step": 70, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.41253289580345154, + "timestamp": "2025-09-05 08:49:51.118932", + "step": 71, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:49:51.326776", + "step": 71, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.45812252163887024, + "timestamp": "2025-09-05 08:49:51.341164", + "step": 72, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:49:51.538100", + "step": 72, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5628377795219421, + "timestamp": "2025-09-05 08:49:51.540135", + "step": 73, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:49:51.742012", + "step": 73, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5147489905357361, + "timestamp": "2025-09-05 08:49:51.743976", + "step": 74, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:49:51.950073", + "step": 74, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36594030261039734, + "timestamp": "2025-09-05 08:49:51.952199", + "step": 75, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:49:52.149922", + "step": 75, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4461878538131714, + "timestamp": "2025-09-05 08:49:52.164170", + "step": 76, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:52.353769", + "step": 76, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4893703758716583, + "timestamp": "2025-09-05 08:49:52.355812", + "step": 77, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:49:52.552180", + "step": 77, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4122447967529297, + "timestamp": "2025-09-05 08:49:52.554284", + "step": 78, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:49:52.762974", + "step": 78, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37286263704299927, + "timestamp": "2025-09-05 08:49:52.764953", + "step": 79, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:49:52.959916", + "step": 79, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.32313352823257446, + "timestamp": "2025-09-05 08:49:52.974096", + "step": 80, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:49:57.618632", + "step": 80, + "epoch": 1 + }, + { + "type": "pplx", + "content": 75.94659559691486, + "timestamp": "2025-09-05 08:49:57.620939", + "step": 80, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 80", + "timestamp": "2025-09-05 08:49:58.082937", + "step": 80, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:49:58.253682", + "step": 80, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4822365939617157, + "timestamp": "2025-09-05 08:49:58.255959", + "step": 81, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:49:58.453126", + "step": 81, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.41527748107910156, + "timestamp": "2025-09-05 08:49:58.455406", + "step": 82, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:49:58.651029", + "step": 82, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.49072203040122986, + "timestamp": "2025-09-05 08:49:58.652972", + "step": 83, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:49:58.852717", + "step": 83, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.48262521624565125, + "timestamp": "2025-09-05 08:49:58.867136", + "step": 84, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:49:59.058672", + "step": 84, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4185166656970978, + "timestamp": "2025-09-05 08:49:59.060765", + "step": 85, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:49:59.257019", + "step": 85, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37654638290405273, + "timestamp": "2025-09-05 08:49:59.259103", + "step": 86, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:49:59.468228", + "step": 86, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4784839451313019, + "timestamp": "2025-09-05 08:49:59.470022", + "step": 87, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:49:59.668389", + "step": 87, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4580772817134857, + "timestamp": "2025-09-05 08:49:59.684569", + "step": 88, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:49:59.884885", + "step": 88, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4850596487522125, + "timestamp": "2025-09-05 08:49:59.887173", + "step": 89, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:00.083921", + "step": 89, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4436725974082947, + "timestamp": "2025-09-05 08:50:00.085702", + "step": 90, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:00.292103", + "step": 90, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3864012360572815, + "timestamp": "2025-09-05 08:50:00.294189", + "step": 91, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:00.493198", + "step": 91, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.41005319356918335, + "timestamp": "2025-09-05 08:50:00.507541", + "step": 92, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:00.697222", + "step": 92, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.29754865169525146, + "timestamp": "2025-09-05 08:50:00.699018", + "step": 93, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:00.895786", + "step": 93, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5100644826889038, + "timestamp": "2025-09-05 08:50:00.897573", + "step": 94, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:50:01.104437", + "step": 94, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5071147680282593, + "timestamp": "2025-09-05 08:50:01.106343", + "step": 95, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:01.304995", + "step": 95, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5573170781135559, + "timestamp": "2025-09-05 08:50:01.319332", + "step": 96, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:01.517000", + "step": 96, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5759362578392029, + "timestamp": "2025-09-05 08:50:01.518928", + "step": 97, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:01.718213", + "step": 97, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.44677627086639404, + "timestamp": "2025-09-05 08:50:01.720227", + "step": 98, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:01.927026", + "step": 98, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5631275177001953, + "timestamp": "2025-09-05 08:50:01.928989", + "step": 99, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:02.126438", + "step": 99, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39676016569137573, + "timestamp": "2025-09-05 08:50:02.141475", + "step": 100, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:50:06.771451", + "step": 100, + "epoch": 1 + }, + { + "type": "pplx", + "content": 73.19032358278096, + "timestamp": "2025-09-05 08:50:06.773213", + "step": 100, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:06.935894", + "step": 100, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34731829166412354, + "timestamp": "2025-09-05 08:50:06.937859", + "step": 101, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:50:07.147849", + "step": 101, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36952462792396545, + "timestamp": "2025-09-05 08:50:07.149839", + "step": 102, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:07.355748", + "step": 102, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4270903468132019, + "timestamp": "2025-09-05 08:50:07.357775", + "step": 103, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:07.555093", + "step": 103, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3678099513053894, + "timestamp": "2025-09-05 08:50:07.569703", + "step": 104, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:07.766139", + "step": 104, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.40596431493759155, + "timestamp": "2025-09-05 08:50:07.768238", + "step": 105, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:07.965392", + "step": 105, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.44920292496681213, + "timestamp": "2025-09-05 08:50:07.967648", + "step": 106, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:08.174690", + "step": 106, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.27072614431381226, + "timestamp": "2025-09-05 08:50:08.176704", + "step": 107, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:08.374070", + "step": 107, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.44459718465805054, + "timestamp": "2025-09-05 08:50:08.388419", + "step": 108, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:50:08.578453", + "step": 108, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3547823131084442, + "timestamp": "2025-09-05 08:50:08.580293", + "step": 109, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:08.777417", + "step": 109, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.32400763034820557, + "timestamp": "2025-09-05 08:50:08.779441", + "step": 110, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:08.985458", + "step": 110, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3592005968093872, + "timestamp": "2025-09-05 08:50:08.987409", + "step": 111, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:09.193950", + "step": 111, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.6158202886581421, + "timestamp": "2025-09-05 08:50:09.208214", + "step": 112, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:09.399264", + "step": 112, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36427071690559387, + "timestamp": "2025-09-05 08:50:09.401514", + "step": 113, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:50:09.610077", + "step": 113, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.28574255108833313, + "timestamp": "2025-09-05 08:50:09.611869", + "step": 114, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:09.810026", + "step": 114, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33345794677734375, + "timestamp": "2025-09-05 08:50:09.812004", + "step": 115, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:10.009667", + "step": 115, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.368472695350647, + "timestamp": "2025-09-05 08:50:10.026099", + "step": 116, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:10.224608", + "step": 116, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3890371322631836, + "timestamp": "2025-09-05 08:50:10.226530", + "step": 117, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:10.433837", + "step": 117, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.44138211011886597, + "timestamp": "2025-09-05 08:50:10.435730", + "step": 118, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:50:10.652742", + "step": 118, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3401064872741699, + "timestamp": "2025-09-05 08:50:10.654683", + "step": 119, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:10.860449", + "step": 119, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.35354849696159363, + "timestamp": "2025-09-05 08:50:10.876223", + "step": 120, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:50:15.516995", + "step": 120, + "epoch": 1 + }, + { + "type": "pplx", + "content": 71.52772055113894, + "timestamp": "2025-09-05 08:50:15.519261", + "step": 120, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 120", + "timestamp": "2025-09-05 08:50:15.977316", + "step": 120, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:50:16.152308", + "step": 120, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4743736684322357, + "timestamp": "2025-09-05 08:50:16.155092", + "step": 121, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:16.352168", + "step": 121, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36207839846611023, + "timestamp": "2025-09-05 08:50:16.354167", + "step": 122, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:50:16.554847", + "step": 122, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4461155831813812, + "timestamp": "2025-09-05 08:50:16.556726", + "step": 123, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:50:16.765987", + "step": 123, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3289283215999603, + "timestamp": "2025-09-05 08:50:16.780395", + "step": 124, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:16.978534", + "step": 124, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39339011907577515, + "timestamp": "2025-09-05 08:50:16.980573", + "step": 125, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:17.179671", + "step": 125, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3922661244869232, + "timestamp": "2025-09-05 08:50:17.181546", + "step": 126, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:17.379811", + "step": 126, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.48367026448249817, + "timestamp": "2025-09-05 08:50:17.381618", + "step": 127, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:17.579726", + "step": 127, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3444158434867859, + "timestamp": "2025-09-05 08:50:17.596345", + "step": 128, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:50:17.793894", + "step": 128, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4369712471961975, + "timestamp": "2025-09-05 08:50:17.795934", + "step": 129, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:17.993686", + "step": 129, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.331216037273407, + "timestamp": "2025-09-05 08:50:17.995933", + "step": 130, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:18.193092", + "step": 130, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3580273389816284, + "timestamp": "2025-09-05 08:50:18.195148", + "step": 131, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:18.392606", + "step": 131, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31754398345947266, + "timestamp": "2025-09-05 08:50:18.407026", + "step": 132, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:50:18.594763", + "step": 132, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5454202890396118, + "timestamp": "2025-09-05 08:50:18.596921", + "step": 133, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:18.794716", + "step": 133, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.43953007459640503, + "timestamp": "2025-09-05 08:50:18.796695", + "step": 134, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:18.994641", + "step": 134, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31143492460250854, + "timestamp": "2025-09-05 08:50:18.996835", + "step": 135, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:19.193535", + "step": 135, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.48143064975738525, + "timestamp": "2025-09-05 08:50:19.210065", + "step": 136, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:19.406796", + "step": 136, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3588726818561554, + "timestamp": "2025-09-05 08:50:19.408708", + "step": 137, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:19.606770", + "step": 137, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38282686471939087, + "timestamp": "2025-09-05 08:50:19.609583", + "step": 138, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:50:19.815782", + "step": 138, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.42063143849372864, + "timestamp": "2025-09-05 08:50:19.818967", + "step": 139, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:20.017464", + "step": 139, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4207852780818939, + "timestamp": "2025-09-05 08:50:20.031795", + "step": 140, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:50:24.652104", + "step": 140, + "epoch": 1 + }, + { + "type": "pplx", + "content": 70.65361235350733, + "timestamp": "2025-09-05 08:50:24.654036", + "step": 140, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:24.817137", + "step": 140, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34773746132850647, + "timestamp": "2025-09-05 08:50:24.819048", + "step": 141, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:25.026073", + "step": 141, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4449472725391388, + "timestamp": "2025-09-05 08:50:25.027902", + "step": 142, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:25.224932", + "step": 142, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4693523645401001, + "timestamp": "2025-09-05 08:50:25.231172", + "step": 143, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:50:25.439007", + "step": 143, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3564371168613434, + "timestamp": "2025-09-05 08:50:25.453380", + "step": 144, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:25.650784", + "step": 144, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37880274653434753, + "timestamp": "2025-09-05 08:50:25.652686", + "step": 145, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:25.859559", + "step": 145, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31610339879989624, + "timestamp": "2025-09-05 08:50:25.862056", + "step": 146, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:26.059662", + "step": 146, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.46133285760879517, + "timestamp": "2025-09-05 08:50:26.061481", + "step": 147, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:50:26.268593", + "step": 147, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.413718044757843, + "timestamp": "2025-09-05 08:50:26.283153", + "step": 148, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:50:26.469459", + "step": 148, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5549363493919373, + "timestamp": "2025-09-05 08:50:26.471379", + "step": 149, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:26.677868", + "step": 149, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.29777753353118896, + "timestamp": "2025-09-05 08:50:26.679559", + "step": 150, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:50:26.875740", + "step": 150, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2828120291233063, + "timestamp": "2025-09-05 08:50:26.878140", + "step": 151, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:50:27.073797", + "step": 151, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4168826639652252, + "timestamp": "2025-09-05 08:50:27.088398", + "step": 152, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:27.277947", + "step": 152, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.35562440752983093, + "timestamp": "2025-09-05 08:50:27.279668", + "step": 153, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:50:27.475780", + "step": 153, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4816155433654785, + "timestamp": "2025-09-05 08:50:27.477761", + "step": 154, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:27.675934", + "step": 154, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.48008760809898376, + "timestamp": "2025-09-05 08:50:27.677685", + "step": 155, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:27.883926", + "step": 155, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.42271170020103455, + "timestamp": "2025-09-05 08:50:27.898295", + "step": 156, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:28.088213", + "step": 156, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3952147960662842, + "timestamp": "2025-09-05 08:50:28.090116", + "step": 157, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:28.286788", + "step": 157, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3233911097049713, + "timestamp": "2025-09-05 08:50:28.288735", + "step": 158, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:50:28.496521", + "step": 158, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.49929437041282654, + "timestamp": "2025-09-05 08:50:28.498313", + "step": 159, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:28.704841", + "step": 159, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4313942492008209, + "timestamp": "2025-09-05 08:50:28.719313", + "step": 160, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:50:33.321295", + "step": 160, + "epoch": 1 + }, + { + "type": "pplx", + "content": 70.65175573066632, + "timestamp": "2025-09-05 08:50:33.323369", + "step": 160, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 160", + "timestamp": "2025-09-05 08:50:33.770272", + "step": 160, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:33.957466", + "step": 160, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34339627623558044, + "timestamp": "2025-09-05 08:50:33.959506", + "step": 161, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:50:34.164500", + "step": 161, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.45955607295036316, + "timestamp": "2025-09-05 08:50:34.166511", + "step": 162, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:50:34.364395", + "step": 162, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4148689806461334, + "timestamp": "2025-09-05 08:50:34.366375", + "step": 163, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:34.572456", + "step": 163, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34146106243133545, + "timestamp": "2025-09-05 08:50:34.588946", + "step": 164, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:34.786564", + "step": 164, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3519960045814514, + "timestamp": "2025-09-05 08:50:34.788408", + "step": 165, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:34.985928", + "step": 165, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2612743675708771, + "timestamp": "2025-09-05 08:50:34.988031", + "step": 166, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:35.186266", + "step": 166, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37086915969848633, + "timestamp": "2025-09-05 08:50:35.188111", + "step": 167, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:35.395045", + "step": 167, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3923819661140442, + "timestamp": "2025-09-05 08:50:35.411448", + "step": 168, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:35.605525", + "step": 168, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5148409605026245, + "timestamp": "2025-09-05 08:50:35.607404", + "step": 169, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:50:35.803412", + "step": 169, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34032538533210754, + "timestamp": "2025-09-05 08:50:35.805098", + "step": 170, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:36.010702", + "step": 170, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5385991930961609, + "timestamp": "2025-09-05 08:50:36.012528", + "step": 171, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:50:36.211564", + "step": 171, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3534490466117859, + "timestamp": "2025-09-05 08:50:36.225837", + "step": 172, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:50:36.419293", + "step": 172, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3421189785003662, + "timestamp": "2025-09-05 08:50:36.421098", + "step": 173, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:36.626841", + "step": 173, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2873148024082184, + "timestamp": "2025-09-05 08:50:36.628694", + "step": 174, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:36.826986", + "step": 174, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.28435009717941284, + "timestamp": "2025-09-05 08:50:36.828813", + "step": 175, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:37.025748", + "step": 175, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3528280258178711, + "timestamp": "2025-09-05 08:50:37.039984", + "step": 176, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:50:37.229084", + "step": 176, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4834965467453003, + "timestamp": "2025-09-05 08:50:37.230935", + "step": 177, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:37.428456", + "step": 177, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33752936124801636, + "timestamp": "2025-09-05 08:50:37.430302", + "step": 178, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:37.636783", + "step": 178, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.44458168745040894, + "timestamp": "2025-09-05 08:50:37.638938", + "step": 179, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:37.837233", + "step": 179, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.40980052947998047, + "timestamp": "2025-09-05 08:50:37.853984", + "step": 180, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:50:42.486191", + "step": 180, + "epoch": 1 + }, + { + "type": "pplx", + "content": 69.88217681460745, + "timestamp": "2025-09-05 08:50:42.488185", + "step": 180, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:50:42.649403", + "step": 180, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3106338381767273, + "timestamp": "2025-09-05 08:50:42.653048", + "step": 181, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:42.858542", + "step": 181, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33472582697868347, + "timestamp": "2025-09-05 08:50:42.861054", + "step": 182, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:50:43.059529", + "step": 182, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5153182148933411, + "timestamp": "2025-09-05 08:50:43.061699", + "step": 183, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:43.259054", + "step": 183, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3759649693965912, + "timestamp": "2025-09-05 08:50:43.275668", + "step": 184, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:43.473055", + "step": 184, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.25464075803756714, + "timestamp": "2025-09-05 08:50:43.475222", + "step": 185, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:43.672348", + "step": 185, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.46503615379333496, + "timestamp": "2025-09-05 08:50:43.674547", + "step": 186, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:43.870814", + "step": 186, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36886751651763916, + "timestamp": "2025-09-05 08:50:43.872745", + "step": 187, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:44.061475", + "step": 187, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31488415598869324, + "timestamp": "2025-09-05 08:50:44.076163", + "step": 188, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:44.257798", + "step": 188, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.41387924551963806, + "timestamp": "2025-09-05 08:50:44.259666", + "step": 189, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:44.463752", + "step": 189, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3407166600227356, + "timestamp": "2025-09-05 08:50:44.465741", + "step": 190, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:44.662388", + "step": 190, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3083406686782837, + "timestamp": "2025-09-05 08:50:44.666316", + "step": 191, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:44.866525", + "step": 191, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.46717512607574463, + "timestamp": "2025-09-05 08:50:44.882923", + "step": 192, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:45.079418", + "step": 192, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37655341625213623, + "timestamp": "2025-09-05 08:50:45.082245", + "step": 193, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:45.286188", + "step": 193, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2714085876941681, + "timestamp": "2025-09-05 08:50:45.289071", + "step": 194, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:45.489108", + "step": 194, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.44437673687934875, + "timestamp": "2025-09-05 08:50:45.491274", + "step": 195, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:45.693314", + "step": 195, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.525282621383667, + "timestamp": "2025-09-05 08:50:45.708374", + "step": 196, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:50:45.900556", + "step": 196, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3641051650047302, + "timestamp": "2025-09-05 08:50:45.905337", + "step": 197, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:46.107005", + "step": 197, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.306657075881958, + "timestamp": "2025-09-05 08:50:46.109337", + "step": 198, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:46.315314", + "step": 198, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3821370601654053, + "timestamp": "2025-09-05 08:50:46.318001", + "step": 199, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:46.530150", + "step": 199, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3802856504917145, + "timestamp": "2025-09-05 08:50:46.546252", + "step": 200, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:50:51.304338", + "step": 200, + "epoch": 1 + }, + { + "type": "pplx", + "content": 68.43342716093302, + "timestamp": "2025-09-05 08:50:51.307018", + "step": 200, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 200", + "timestamp": "2025-09-05 08:50:51.794754", + "step": 200, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:51.964613", + "step": 200, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.32048162817955017, + "timestamp": "2025-09-05 08:50:51.966794", + "step": 201, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:52.173366", + "step": 201, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3719877600669861, + "timestamp": "2025-09-05 08:50:52.175903", + "step": 202, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:50:52.374299", + "step": 202, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.47192662954330444, + "timestamp": "2025-09-05 08:50:52.376239", + "step": 203, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:52.573112", + "step": 203, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.35540571808815, + "timestamp": "2025-09-05 08:50:52.587796", + "step": 204, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:50:52.777809", + "step": 204, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4534974694252014, + "timestamp": "2025-09-05 08:50:52.779680", + "step": 205, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:52.976475", + "step": 205, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2541635036468506, + "timestamp": "2025-09-05 08:50:52.978872", + "step": 206, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:53.186056", + "step": 206, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4390164017677307, + "timestamp": "2025-09-05 08:50:53.187977", + "step": 207, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:53.394871", + "step": 207, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36420944333076477, + "timestamp": "2025-09-05 08:50:53.409950", + "step": 208, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:53.598684", + "step": 208, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3081270754337311, + "timestamp": "2025-09-05 08:50:53.601276", + "step": 209, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:53.807163", + "step": 209, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.32181161642074585, + "timestamp": "2025-09-05 08:50:53.809123", + "step": 210, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:54.016227", + "step": 210, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37587931752204895, + "timestamp": "2025-09-05 08:50:54.018183", + "step": 211, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:50:54.224894", + "step": 211, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3801794648170471, + "timestamp": "2025-09-05 08:50:54.241161", + "step": 212, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:54.438898", + "step": 212, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3296459913253784, + "timestamp": "2025-09-05 08:50:54.440903", + "step": 213, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:50:54.636463", + "step": 213, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.40059760212898254, + "timestamp": "2025-09-05 08:50:54.638462", + "step": 214, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:50:54.845342", + "step": 214, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.45545437932014465, + "timestamp": "2025-09-05 08:50:54.847321", + "step": 215, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:50:55.045554", + "step": 215, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5213336944580078, + "timestamp": "2025-09-05 08:50:55.059885", + "step": 216, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:50:55.248331", + "step": 216, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3463953733444214, + "timestamp": "2025-09-05 08:50:55.250555", + "step": 217, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:55.456155", + "step": 217, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36651042103767395, + "timestamp": "2025-09-05 08:50:55.458184", + "step": 218, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:55.664880", + "step": 218, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39168816804885864, + "timestamp": "2025-09-05 08:50:55.666768", + "step": 219, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:50:55.871436", + "step": 219, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.43237030506134033, + "timestamp": "2025-09-05 08:50:55.885583", + "step": 220, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:51:00.528160", + "step": 220, + "epoch": 1 + }, + { + "type": "pplx", + "content": 66.70753911619636, + "timestamp": "2025-09-05 08:51:00.530320", + "step": 220, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:00.694163", + "step": 220, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.27549687027931213, + "timestamp": "2025-09-05 08:51:00.696251", + "step": 221, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:00.863090", + "step": 221, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4633401036262512, + "timestamp": "2025-09-05 08:51:00.865224", + "step": 222, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:01.071101", + "step": 222, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.297171413898468, + "timestamp": "2025-09-05 08:51:01.073322", + "step": 223, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:01.281221", + "step": 223, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38710904121398926, + "timestamp": "2025-09-05 08:51:01.298898", + "step": 224, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:01.496439", + "step": 224, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4238070249557495, + "timestamp": "2025-09-05 08:51:01.498489", + "step": 225, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:01.706000", + "step": 225, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4317415952682495, + "timestamp": "2025-09-05 08:51:01.708487", + "step": 226, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:01.908165", + "step": 226, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31439393758773804, + "timestamp": "2025-09-05 08:51:01.910255", + "step": 227, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:02.116936", + "step": 227, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2890167236328125, + "timestamp": "2025-09-05 08:51:02.131642", + "step": 228, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:51:02.324743", + "step": 228, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4581849277019501, + "timestamp": "2025-09-05 08:51:02.327444", + "step": 229, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:02.525196", + "step": 229, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3810874819755554, + "timestamp": "2025-09-05 08:51:02.527445", + "step": 230, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:02.733657", + "step": 230, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2984119653701782, + "timestamp": "2025-09-05 08:51:02.736785", + "step": 231, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:02.934302", + "step": 231, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37125611305236816, + "timestamp": "2025-09-05 08:51:02.948305", + "step": 232, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:03.144451", + "step": 232, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2847510874271393, + "timestamp": "2025-09-05 08:51:03.146597", + "step": 233, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:03.352972", + "step": 233, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33447766304016113, + "timestamp": "2025-09-05 08:51:03.355044", + "step": 234, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:03.552131", + "step": 234, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.32979437708854675, + "timestamp": "2025-09-05 08:51:03.554213", + "step": 235, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:03.751051", + "step": 235, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.45558422803878784, + "timestamp": "2025-09-05 08:51:03.764967", + "step": 236, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:51:03.953115", + "step": 236, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3728318214416504, + "timestamp": "2025-09-05 08:51:03.955116", + "step": 237, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:51:04.152915", + "step": 237, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31200921535491943, + "timestamp": "2025-09-05 08:51:04.155039", + "step": 238, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:04.360922", + "step": 238, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.548952579498291, + "timestamp": "2025-09-05 08:51:04.362714", + "step": 239, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:04.560346", + "step": 239, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.28765401244163513, + "timestamp": "2025-09-05 08:51:04.576314", + "step": 240, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:51:09.223025", + "step": 240, + "epoch": 1 + }, + { + "type": "pplx", + "content": 65.06928550317252, + "timestamp": "2025-09-05 08:51:09.225006", + "step": 240, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 240", + "timestamp": "2025-09-05 08:51:09.703186", + "step": 240, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:09.874355", + "step": 240, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3755890130996704, + "timestamp": "2025-09-05 08:51:09.876365", + "step": 241, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:51:10.072449", + "step": 241, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38877207040786743, + "timestamp": "2025-09-05 08:51:10.074440", + "step": 242, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:51:10.280829", + "step": 242, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.49628746509552, + "timestamp": "2025-09-05 08:51:10.282995", + "step": 243, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:10.482218", + "step": 243, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5397868752479553, + "timestamp": "2025-09-05 08:51:10.498957", + "step": 244, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:10.696694", + "step": 244, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3450985848903656, + "timestamp": "2025-09-05 08:51:10.698570", + "step": 245, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:10.893683", + "step": 245, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30554214119911194, + "timestamp": "2025-09-05 08:51:10.895563", + "step": 246, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:51:11.091433", + "step": 246, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3367531895637512, + "timestamp": "2025-09-05 08:51:11.093469", + "step": 247, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:51:11.298971", + "step": 247, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33166182041168213, + "timestamp": "2025-09-05 08:51:11.313272", + "step": 248, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:11.503053", + "step": 248, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2959182858467102, + "timestamp": "2025-09-05 08:51:11.505145", + "step": 249, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:11.703129", + "step": 249, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3209533095359802, + "timestamp": "2025-09-05 08:51:11.705161", + "step": 250, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:11.902631", + "step": 250, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3812739849090576, + "timestamp": "2025-09-05 08:51:11.904804", + "step": 251, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:51:12.103336", + "step": 251, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31804463267326355, + "timestamp": "2025-09-05 08:51:12.119920", + "step": 252, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:12.325726", + "step": 252, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38454669713974, + "timestamp": "2025-09-05 08:51:12.327480", + "step": 253, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:12.533970", + "step": 253, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.357085257768631, + "timestamp": "2025-09-05 08:51:12.535934", + "step": 254, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:51:12.732659", + "step": 254, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5086879730224609, + "timestamp": "2025-09-05 08:51:12.734686", + "step": 255, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:51:12.942675", + "step": 255, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31578245759010315, + "timestamp": "2025-09-05 08:51:12.957030", + "step": 256, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:51:13.145896", + "step": 256, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.530243456363678, + "timestamp": "2025-09-05 08:51:13.147719", + "step": 257, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:13.344584", + "step": 257, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3682427406311035, + "timestamp": "2025-09-05 08:51:13.346470", + "step": 258, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:13.543961", + "step": 258, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3959914743900299, + "timestamp": "2025-09-05 08:51:13.545895", + "step": 259, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:51:13.744614", + "step": 259, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.46029427647590637, + "timestamp": "2025-09-05 08:51:13.758973", + "step": 260, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:51:18.409861", + "step": 260, + "epoch": 1 + }, + { + "type": "pplx", + "content": 64.26154569680897, + "timestamp": "2025-09-05 08:51:18.412481", + "step": 260, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:18.575862", + "step": 260, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4201185405254364, + "timestamp": "2025-09-05 08:51:18.579922", + "step": 261, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:18.749512", + "step": 261, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34828004240989685, + "timestamp": "2025-09-05 08:51:18.751937", + "step": 262, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:18.961324", + "step": 262, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3680271506309509, + "timestamp": "2025-09-05 08:51:18.963334", + "step": 263, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:19.161929", + "step": 263, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.430779367685318, + "timestamp": "2025-09-05 08:51:19.177248", + "step": 264, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:19.370004", + "step": 264, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4136819839477539, + "timestamp": "2025-09-05 08:51:19.372994", + "step": 265, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:19.582552", + "step": 265, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33954256772994995, + "timestamp": "2025-09-05 08:51:19.585434", + "step": 266, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:19.788353", + "step": 266, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3512232005596161, + "timestamp": "2025-09-05 08:51:19.791506", + "step": 267, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:20.000570", + "step": 267, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.22923022508621216, + "timestamp": "2025-09-05 08:51:20.016069", + "step": 268, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:51:20.207996", + "step": 268, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39777079224586487, + "timestamp": "2025-09-05 08:51:20.210365", + "step": 269, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:20.410582", + "step": 269, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34263524413108826, + "timestamp": "2025-09-05 08:51:20.412726", + "step": 270, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:20.611599", + "step": 270, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33798688650131226, + "timestamp": "2025-09-05 08:51:20.616122", + "step": 271, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:20.815141", + "step": 271, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5267437696456909, + "timestamp": "2025-09-05 08:51:20.834848", + "step": 272, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:51:21.034317", + "step": 272, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.41006460785865784, + "timestamp": "2025-09-05 08:51:21.038358", + "step": 273, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:21.244494", + "step": 273, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4595869779586792, + "timestamp": "2025-09-05 08:51:21.247391", + "step": 274, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:21.450319", + "step": 274, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.35671982169151306, + "timestamp": "2025-09-05 08:51:21.452813", + "step": 275, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:21.662382", + "step": 275, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3464692234992981, + "timestamp": "2025-09-05 08:51:21.678478", + "step": 276, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:21.873740", + "step": 276, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.513616144657135, + "timestamp": "2025-09-05 08:51:21.876507", + "step": 277, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:22.086577", + "step": 277, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3644759953022003, + "timestamp": "2025-09-05 08:51:22.090146", + "step": 278, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:22.295231", + "step": 278, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30014386773109436, + "timestamp": "2025-09-05 08:51:22.302241", + "step": 279, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:22.516028", + "step": 279, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3263522684574127, + "timestamp": "2025-09-05 08:51:22.532144", + "step": 280, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:51:27.228302", + "step": 280, + "epoch": 1 + }, + { + "type": "pplx", + "content": 63.39580635831405, + "timestamp": "2025-09-05 08:51:27.231907", + "step": 280, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 280", + "timestamp": "2025-09-05 08:51:27.737466", + "step": 280, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:27.903601", + "step": 280, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2871383726596832, + "timestamp": "2025-09-05 08:51:27.905516", + "step": 281, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:28.113830", + "step": 281, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5244265198707581, + "timestamp": "2025-09-05 08:51:28.116159", + "step": 282, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:28.287939", + "step": 282, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30767932534217834, + "timestamp": "2025-09-05 08:51:28.290430", + "step": 283, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:28.498428", + "step": 283, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.42134183645248413, + "timestamp": "2025-09-05 08:51:28.516281", + "step": 284, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:28.708204", + "step": 284, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3752477467060089, + "timestamp": "2025-09-05 08:51:28.710520", + "step": 285, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:51:28.917749", + "step": 285, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3297032415866852, + "timestamp": "2025-09-05 08:51:28.920190", + "step": 286, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:29.118017", + "step": 286, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3052656054496765, + "timestamp": "2025-09-05 08:51:29.121103", + "step": 287, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:29.326871", + "step": 287, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3355713188648224, + "timestamp": "2025-09-05 08:51:29.344871", + "step": 288, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:51:29.543183", + "step": 288, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.35573214292526245, + "timestamp": "2025-09-05 08:51:29.545838", + "step": 289, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:29.753080", + "step": 289, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4543474316596985, + "timestamp": "2025-09-05 08:51:29.756576", + "step": 290, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:51:29.957195", + "step": 290, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3964625298976898, + "timestamp": "2025-09-05 08:51:29.959974", + "step": 291, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:30.165709", + "step": 291, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3353409469127655, + "timestamp": "2025-09-05 08:51:30.180114", + "step": 292, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:51:30.371786", + "step": 292, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4800986647605896, + "timestamp": "2025-09-05 08:51:30.374286", + "step": 293, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:30.576864", + "step": 293, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3704434633255005, + "timestamp": "2025-09-05 08:51:30.579299", + "step": 294, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:30.780856", + "step": 294, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4373602271080017, + "timestamp": "2025-09-05 08:51:30.784033", + "step": 295, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:30.983714", + "step": 295, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4753832519054413, + "timestamp": "2025-09-05 08:51:30.998392", + "step": 296, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:31.191891", + "step": 296, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4039275646209717, + "timestamp": "2025-09-05 08:51:31.195051", + "step": 297, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:31.409572", + "step": 297, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.35360774397850037, + "timestamp": "2025-09-05 08:51:31.411839", + "step": 298, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:51:31.611994", + "step": 298, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4502357840538025, + "timestamp": "2025-09-05 08:51:31.616480", + "step": 299, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:31.824098", + "step": 299, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34486469626426697, + "timestamp": "2025-09-05 08:51:31.839022", + "step": 300, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:51:36.541321", + "step": 300, + "epoch": 1 + }, + { + "type": "pplx", + "content": 62.91448786259217, + "timestamp": "2025-09-05 08:51:36.543514", + "step": 300, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:51:36.706204", + "step": 300, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.42969754338264465, + "timestamp": "2025-09-05 08:51:36.708169", + "step": 301, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:36.916708", + "step": 301, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3379324972629547, + "timestamp": "2025-09-05 08:51:36.920013", + "step": 302, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:37.118930", + "step": 302, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.28274306654930115, + "timestamp": "2025-09-05 08:51:37.121827", + "step": 303, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:51:37.330465", + "step": 303, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3989158570766449, + "timestamp": "2025-09-05 08:51:37.344698", + "step": 304, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:37.535431", + "step": 304, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.45338067412376404, + "timestamp": "2025-09-05 08:51:37.538881", + "step": 305, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:51:37.735743", + "step": 305, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4826226234436035, + "timestamp": "2025-09-05 08:51:37.739435", + "step": 306, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:37.937478", + "step": 306, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.23507355153560638, + "timestamp": "2025-09-05 08:51:37.939847", + "step": 307, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:38.148933", + "step": 307, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.41786983609199524, + "timestamp": "2025-09-05 08:51:38.165846", + "step": 308, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:51:38.364288", + "step": 308, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3461589813232422, + "timestamp": "2025-09-05 08:51:38.366570", + "step": 309, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:38.576965", + "step": 309, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4844832420349121, + "timestamp": "2025-09-05 08:51:38.579372", + "step": 310, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:38.787986", + "step": 310, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38971275091171265, + "timestamp": "2025-09-05 08:51:38.790638", + "step": 311, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:51:38.997464", + "step": 311, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.474933385848999, + "timestamp": "2025-09-05 08:51:39.013582", + "step": 312, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:51:39.206540", + "step": 312, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37481558322906494, + "timestamp": "2025-09-05 08:51:39.210174", + "step": 313, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:51:39.410262", + "step": 313, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3350926637649536, + "timestamp": "2025-09-05 08:51:39.412546", + "step": 314, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:39.622336", + "step": 314, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5339053869247437, + "timestamp": "2025-09-05 08:51:39.624825", + "step": 315, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:39.836689", + "step": 315, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3207130432128906, + "timestamp": "2025-09-05 08:51:39.851434", + "step": 316, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:51:40.045517", + "step": 316, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.35091298818588257, + "timestamp": "2025-09-05 08:51:40.047800", + "step": 317, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:40.248682", + "step": 317, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3151177763938904, + "timestamp": "2025-09-05 08:51:40.251910", + "step": 318, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:40.448949", + "step": 318, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2805836796760559, + "timestamp": "2025-09-05 08:51:40.453248", + "step": 319, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:40.664741", + "step": 319, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31773483753204346, + "timestamp": "2025-09-05 08:51:40.679760", + "step": 320, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:51:45.519892", + "step": 320, + "epoch": 1 + }, + { + "type": "pplx", + "content": 62.03072212739932, + "timestamp": "2025-09-05 08:51:45.522026", + "step": 320, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 320", + "timestamp": "2025-09-05 08:51:45.987925", + "step": 320, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:46.160525", + "step": 320, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.287524551153183, + "timestamp": "2025-09-05 08:51:46.164165", + "step": 321, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:46.367278", + "step": 321, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.356285959482193, + "timestamp": "2025-09-05 08:51:46.369491", + "step": 322, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:46.578658", + "step": 322, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5080327987670898, + "timestamp": "2025-09-05 08:51:46.581762", + "step": 323, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:46.786024", + "step": 323, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3835434019565582, + "timestamp": "2025-09-05 08:51:46.803007", + "step": 324, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:47.003397", + "step": 324, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3160017430782318, + "timestamp": "2025-09-05 08:51:47.006010", + "step": 325, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:47.207660", + "step": 325, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4292439818382263, + "timestamp": "2025-09-05 08:51:47.210036", + "step": 326, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 240 + ], + "flops": 4800029206464.0 + }, + "timestamp": "2025-09-05 08:51:47.416253", + "step": 326, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5584763884544373, + "timestamp": "2025-09-05 08:51:47.418687", + "step": 327, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:47.628242", + "step": 327, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2741956412792206, + "timestamp": "2025-09-05 08:51:47.645050", + "step": 328, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:51:47.846435", + "step": 328, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.45366376638412476, + "timestamp": "2025-09-05 08:51:47.849132", + "step": 329, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:48.049050", + "step": 329, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4707634449005127, + "timestamp": "2025-09-05 08:51:48.051331", + "step": 330, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:48.252308", + "step": 330, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.32083678245544434, + "timestamp": "2025-09-05 08:51:48.254709", + "step": 331, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:48.453175", + "step": 331, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.23594726622104645, + "timestamp": "2025-09-05 08:51:48.469797", + "step": 332, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:51:48.670973", + "step": 332, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.342578649520874, + "timestamp": "2025-09-05 08:51:48.673988", + "step": 333, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:48.884254", + "step": 333, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3945893943309784, + "timestamp": "2025-09-05 08:51:48.886685", + "step": 334, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:49.086662", + "step": 334, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2937992811203003, + "timestamp": "2025-09-05 08:51:49.090876", + "step": 335, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:49.288803", + "step": 335, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3234483003616333, + "timestamp": "2025-09-05 08:51:49.305690", + "step": 336, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:49.506331", + "step": 336, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3142837584018707, + "timestamp": "2025-09-05 08:51:49.508588", + "step": 337, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:51:49.706321", + "step": 337, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.29564931988716125, + "timestamp": "2025-09-05 08:51:49.712194", + "step": 338, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:49.919033", + "step": 338, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2818536162376404, + "timestamp": "2025-09-05 08:51:49.921396", + "step": 339, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:51:50.129125", + "step": 339, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31897684931755066, + "timestamp": "2025-09-05 08:51:50.145532", + "step": 340, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:51:54.884325", + "step": 340, + "epoch": 1 + }, + { + "type": "pplx", + "content": 61.25419831481063, + "timestamp": "2025-09-05 08:51:54.886751", + "step": 340, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:55.051072", + "step": 340, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3685891926288605, + "timestamp": "2025-09-05 08:51:55.054219", + "step": 341, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:51:55.265043", + "step": 341, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4979967176914215, + "timestamp": "2025-09-05 08:51:55.267081", + "step": 342, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:55.467739", + "step": 342, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4489175081253052, + "timestamp": "2025-09-05 08:51:55.470109", + "step": 343, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:55.671335", + "step": 343, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.42277976870536804, + "timestamp": "2025-09-05 08:51:55.686218", + "step": 344, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:55.877423", + "step": 344, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.26805245876312256, + "timestamp": "2025-09-05 08:51:55.881086", + "step": 345, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:51:56.083943", + "step": 345, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4023347496986389, + "timestamp": "2025-09-05 08:51:56.086884", + "step": 346, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:51:56.285593", + "step": 346, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.21793437004089355, + "timestamp": "2025-09-05 08:51:56.287738", + "step": 347, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:56.497434", + "step": 347, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.42367270588874817, + "timestamp": "2025-09-05 08:51:56.514045", + "step": 348, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:56.714891", + "step": 348, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.19184860587120056, + "timestamp": "2025-09-05 08:51:56.718750", + "step": 349, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:56.916934", + "step": 349, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4862029254436493, + "timestamp": "2025-09-05 08:51:56.919071", + "step": 350, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:57.119662", + "step": 350, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39638984203338623, + "timestamp": "2025-09-05 08:51:57.123211", + "step": 351, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:51:57.321992", + "step": 351, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.44743719696998596, + "timestamp": "2025-09-05 08:51:57.337160", + "step": 352, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:57.531589", + "step": 352, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3436514139175415, + "timestamp": "2025-09-05 08:51:57.533859", + "step": 353, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:57.733979", + "step": 353, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5128458142280579, + "timestamp": "2025-09-05 08:51:57.736308", + "step": 354, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:57.936356", + "step": 354, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38132208585739136, + "timestamp": "2025-09-05 08:51:57.938784", + "step": 355, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:58.151174", + "step": 355, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4043236970901489, + "timestamp": "2025-09-05 08:51:58.166887", + "step": 356, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:51:58.368726", + "step": 356, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4835319221019745, + "timestamp": "2025-09-05 08:51:58.371395", + "step": 357, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:51:58.579671", + "step": 357, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4353181719779968, + "timestamp": "2025-09-05 08:51:58.581758", + "step": 358, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:51:58.792551", + "step": 358, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.532089352607727, + "timestamp": "2025-09-05 08:51:58.795005", + "step": 359, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:51:59.005467", + "step": 359, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36872991919517517, + "timestamp": "2025-09-05 08:51:59.022067", + "step": 360, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:52:03.741679", + "step": 360, + "epoch": 1 + }, + { + "type": "pplx", + "content": 60.85764402253067, + "timestamp": "2025-09-05 08:52:03.744697", + "step": 360, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 360", + "timestamp": "2025-09-05 08:52:04.267463", + "step": 360, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:52:04.467076", + "step": 360, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4337652027606964, + "timestamp": "2025-09-05 08:52:04.469790", + "step": 361, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:04.669343", + "step": 361, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3073671758174896, + "timestamp": "2025-09-05 08:52:04.672067", + "step": 362, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:04.870495", + "step": 362, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3424108922481537, + "timestamp": "2025-09-05 08:52:04.872776", + "step": 363, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:05.080213", + "step": 363, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4130728542804718, + "timestamp": "2025-09-05 08:52:05.097032", + "step": 364, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:52:05.296805", + "step": 364, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5122348666191101, + "timestamp": "2025-09-05 08:52:05.299866", + "step": 365, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:52:05.512098", + "step": 365, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.35365021228790283, + "timestamp": "2025-09-05 08:52:05.514520", + "step": 366, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:52:05.725698", + "step": 366, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3705803155899048, + "timestamp": "2025-09-05 08:52:05.728166", + "step": 367, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:05.927508", + "step": 367, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31572356820106506, + "timestamp": "2025-09-05 08:52:05.943003", + "step": 368, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:52:06.136877", + "step": 368, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.484190434217453, + "timestamp": "2025-09-05 08:52:06.139123", + "step": 369, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:52:06.338365", + "step": 369, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33856910467147827, + "timestamp": "2025-09-05 08:52:06.340412", + "step": 370, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:52:06.548382", + "step": 370, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33974528312683105, + "timestamp": "2025-09-05 08:52:06.551882", + "step": 371, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:06.725021", + "step": 371, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.28406357765197754, + "timestamp": "2025-09-05 08:52:06.734842", + "step": 372, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:06.902028", + "step": 372, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4505447447299957, + "timestamp": "2025-09-05 08:52:06.904266", + "step": 373, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:52:07.072962", + "step": 373, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.41192686557769775, + "timestamp": "2025-09-05 08:52:07.075418", + "step": 374, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:07.259228", + "step": 374, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.48100942373275757, + "timestamp": "2025-09-05 08:52:07.261588", + "step": 375, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:07.442104", + "step": 375, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4199207127094269, + "timestamp": "2025-09-05 08:52:07.451785", + "step": 376, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:07.618469", + "step": 376, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.44891834259033203, + "timestamp": "2025-09-05 08:52:07.620478", + "step": 377, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:07.794549", + "step": 377, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38180747628211975, + "timestamp": "2025-09-05 08:52:07.796896", + "step": 378, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:07.967300", + "step": 378, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4493151307106018, + "timestamp": "2025-09-05 08:52:07.970670", + "step": 379, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:08.151498", + "step": 379, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3687281906604767, + "timestamp": "2025-09-05 08:52:08.161079", + "step": 380, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:52:12.875645", + "step": 380, + "epoch": 1 + }, + { + "type": "pplx", + "content": 60.50173690886968, + "timestamp": "2025-09-05 08:52:12.879267", + "step": 380, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:52:13.045039", + "step": 380, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.24862390756607056, + "timestamp": "2025-09-05 08:52:13.050670", + "step": 381, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:13.251042", + "step": 381, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3486191928386688, + "timestamp": "2025-09-05 08:52:13.253890", + "step": 382, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:52:13.465124", + "step": 382, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3658300042152405, + "timestamp": "2025-09-05 08:52:13.467841", + "step": 383, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:52:13.675325", + "step": 383, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31089478731155396, + "timestamp": "2025-09-05 08:52:13.693652", + "step": 384, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:13.879125", + "step": 384, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3752908408641815, + "timestamp": "2025-09-05 08:52:13.881370", + "step": 385, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:14.090355", + "step": 385, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5100250244140625, + "timestamp": "2025-09-05 08:52:14.092533", + "step": 386, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:52:14.301671", + "step": 386, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5159087181091309, + "timestamp": "2025-09-05 08:52:14.307073", + "step": 387, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:14.505032", + "step": 387, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4958210289478302, + "timestamp": "2025-09-05 08:52:14.527608", + "step": 388, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:14.729709", + "step": 388, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.47933509945869446, + "timestamp": "2025-09-05 08:52:14.732227", + "step": 389, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:52:14.931821", + "step": 389, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4100204110145569, + "timestamp": "2025-09-05 08:52:14.936678", + "step": 390, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:15.148605", + "step": 390, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4352912902832031, + "timestamp": "2025-09-05 08:52:15.151091", + "step": 391, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:52:15.349293", + "step": 391, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39970317482948303, + "timestamp": "2025-09-05 08:52:15.365860", + "step": 392, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:52:15.567312", + "step": 392, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3486652076244354, + "timestamp": "2025-09-05 08:52:15.569587", + "step": 393, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:52:15.779892", + "step": 393, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4482623040676117, + "timestamp": "2025-09-05 08:52:15.781888", + "step": 394, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:52:15.955997", + "step": 394, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3497057557106018, + "timestamp": "2025-09-05 08:52:15.959493", + "step": 395, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:16.167562", + "step": 395, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2155306488275528, + "timestamp": "2025-09-05 08:52:16.181975", + "step": 396, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:16.374926", + "step": 396, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2984706163406372, + "timestamp": "2025-09-05 08:52:16.377026", + "step": 397, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:16.578129", + "step": 397, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4520842134952545, + "timestamp": "2025-09-05 08:52:16.580222", + "step": 398, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:52:16.787897", + "step": 398, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4184480905532837, + "timestamp": "2025-09-05 08:52:16.791025", + "step": 399, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:52:17.000795", + "step": 399, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.391579806804657, + "timestamp": "2025-09-05 08:52:17.016163", + "step": 400, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:52:21.710130", + "step": 400, + "epoch": 1 + }, + { + "type": "pplx", + "content": 59.69678279140599, + "timestamp": "2025-09-05 08:52:21.712252", + "step": 400, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 400", + "timestamp": "2025-09-05 08:52:22.212048", + "step": 400, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:52:22.382310", + "step": 400, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5822342038154602, + "timestamp": "2025-09-05 08:52:22.384599", + "step": 401, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:52:22.583471", + "step": 401, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.26364198327064514, + "timestamp": "2025-09-05 08:52:22.585575", + "step": 402, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:22.786247", + "step": 402, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.415432333946228, + "timestamp": "2025-09-05 08:52:22.788568", + "step": 403, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:52:22.998089", + "step": 403, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.43436604738235474, + "timestamp": "2025-09-05 08:52:23.014811", + "step": 404, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:52:23.218248", + "step": 404, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33526358008384705, + "timestamp": "2025-09-05 08:52:23.220827", + "step": 405, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:52:23.419773", + "step": 405, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3843635022640228, + "timestamp": "2025-09-05 08:52:23.423685", + "step": 406, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:52:23.622251", + "step": 406, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3481503427028656, + "timestamp": "2025-09-05 08:52:23.625075", + "step": 407, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:23.833551", + "step": 407, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.400000661611557, + "timestamp": "2025-09-05 08:52:23.847684", + "step": 408, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:52:24.040538", + "step": 408, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3273821473121643, + "timestamp": "2025-09-05 08:52:24.042628", + "step": 409, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:24.251973", + "step": 409, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.29011961817741394, + "timestamp": "2025-09-05 08:52:24.254121", + "step": 410, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:24.454853", + "step": 410, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4190179407596588, + "timestamp": "2025-09-05 08:52:24.457738", + "step": 411, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:52:24.655875", + "step": 411, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3011251389980316, + "timestamp": "2025-09-05 08:52:24.670472", + "step": 412, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:52:24.865142", + "step": 412, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34651055932044983, + "timestamp": "2025-09-05 08:52:24.869347", + "step": 413, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:52:25.068331", + "step": 413, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4374030530452728, + "timestamp": "2025-09-05 08:52:25.071673", + "step": 414, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:52:25.275075", + "step": 414, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3524605929851532, + "timestamp": "2025-09-05 08:52:25.279079", + "step": 415, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:52:25.476558", + "step": 415, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36618897318840027, + "timestamp": "2025-09-05 08:52:25.491665", + "step": 416, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:52:25.683339", + "step": 416, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3304573893547058, + "timestamp": "2025-09-05 08:52:25.685271", + "step": 417, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:25.893443", + "step": 417, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2957325577735901, + "timestamp": "2025-09-05 08:52:25.895735", + "step": 418, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:52:26.104484", + "step": 418, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3814864158630371, + "timestamp": "2025-09-05 08:52:26.107123", + "step": 419, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:26.312068", + "step": 419, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3151036202907562, + "timestamp": "2025-09-05 08:52:26.328851", + "step": 420, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:52:31.036153", + "step": 420, + "epoch": 1 + }, + { + "type": "pplx", + "content": 59.35530192163852, + "timestamp": "2025-09-05 08:52:31.038978", + "step": 420, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:52:31.210366", + "step": 420, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3436422348022461, + "timestamp": "2025-09-05 08:52:31.216851", + "step": 421, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:52:31.427152", + "step": 421, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.24939358234405518, + "timestamp": "2025-09-05 08:52:31.429321", + "step": 422, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:31.603677", + "step": 422, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2410716712474823, + "timestamp": "2025-09-05 08:52:31.605963", + "step": 423, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:31.775227", + "step": 423, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4125831723213196, + "timestamp": "2025-09-05 08:52:31.792232", + "step": 424, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:31.995860", + "step": 424, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3474792242050171, + "timestamp": "2025-09-05 08:52:31.998205", + "step": 425, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:52:32.198239", + "step": 425, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31995099782943726, + "timestamp": "2025-09-05 08:52:32.200884", + "step": 426, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:52:32.413494", + "step": 426, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3391028046607971, + "timestamp": "2025-09-05 08:52:32.416831", + "step": 427, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:52:32.626920", + "step": 427, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38398104906082153, + "timestamp": "2025-09-05 08:52:32.643118", + "step": 428, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:52:32.839796", + "step": 428, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4830264151096344, + "timestamp": "2025-09-05 08:52:32.843610", + "step": 429, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:33.096484", + "step": 429, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38072633743286133, + "timestamp": "2025-09-05 08:52:33.098709", + "step": 430, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:33.350670", + "step": 430, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.35430270433425903, + "timestamp": "2025-09-05 08:52:33.393391", + "step": 431, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:52:33.603830", + "step": 431, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33442458510398865, + "timestamp": "2025-09-05 08:52:33.618554", + "step": 432, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:52:33.810928", + "step": 432, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.48529914021492004, + "timestamp": "2025-09-05 08:52:33.814095", + "step": 433, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:52:34.016352", + "step": 433, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.414386510848999, + "timestamp": "2025-09-05 08:52:34.018901", + "step": 434, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:34.228728", + "step": 434, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5347086191177368, + "timestamp": "2025-09-05 08:52:34.231277", + "step": 435, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:34.430951", + "step": 435, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3398420214653015, + "timestamp": "2025-09-05 08:52:34.446031", + "step": 436, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:52:34.681402", + "step": 436, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33175885677337646, + "timestamp": "2025-09-05 08:52:34.683884", + "step": 437, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:34.891436", + "step": 437, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30734267830848694, + "timestamp": "2025-09-05 08:52:34.894720", + "step": 438, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:52:35.093238", + "step": 438, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2820775508880615, + "timestamp": "2025-09-05 08:52:35.096035", + "step": 439, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:52:35.389076", + "step": 439, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3953246772289276, + "timestamp": "2025-09-05 08:52:35.405738", + "step": 440, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:52:40.619422", + "step": 440, + "epoch": 1 + }, + { + "type": "pplx", + "content": 59.06938762912424, + "timestamp": "2025-09-05 08:52:40.622034", + "step": 440, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 440", + "timestamp": "2025-09-05 08:52:41.124812", + "step": 440, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:41.337105", + "step": 440, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.428076833486557, + "timestamp": "2025-09-05 08:52:41.362204", + "step": 441, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:41.616790", + "step": 441, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30740123987197876, + "timestamp": "2025-09-05 08:52:41.619651", + "step": 442, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:41.829409", + "step": 442, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3571283221244812, + "timestamp": "2025-09-05 08:52:41.831738", + "step": 443, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:52:42.030375", + "step": 443, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3677189350128174, + "timestamp": "2025-09-05 08:52:42.044872", + "step": 444, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:52:42.237303", + "step": 444, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2622219920158386, + "timestamp": "2025-09-05 08:52:42.240974", + "step": 445, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:52:42.449712", + "step": 445, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3243364989757538, + "timestamp": "2025-09-05 08:52:42.452727", + "step": 446, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:52:42.704565", + "step": 446, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2399972379207611, + "timestamp": "2025-09-05 08:52:42.706805", + "step": 447, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:42.906947", + "step": 447, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37382861971855164, + "timestamp": "2025-09-05 08:52:42.921140", + "step": 448, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:43.115262", + "step": 448, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.42507728934288025, + "timestamp": "2025-09-05 08:52:43.118670", + "step": 449, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:52:43.370405", + "step": 449, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30817678570747375, + "timestamp": "2025-09-05 08:52:43.373234", + "step": 450, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:52:43.575061", + "step": 450, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3689974248409271, + "timestamp": "2025-09-05 08:52:43.581882", + "step": 451, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:52:43.786456", + "step": 451, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3893755078315735, + "timestamp": "2025-09-05 08:52:43.803301", + "step": 452, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:44.068521", + "step": 452, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4274936020374298, + "timestamp": "2025-09-05 08:52:44.071142", + "step": 453, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:44.279260", + "step": 453, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.24627014994621277, + "timestamp": "2025-09-05 08:52:44.281140", + "step": 454, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:44.488500", + "step": 454, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.42970582842826843, + "timestamp": "2025-09-05 08:52:44.495746", + "step": 455, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:44.698466", + "step": 455, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4055229127407074, + "timestamp": "2025-09-05 08:52:44.716271", + "step": 456, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:44.910316", + "step": 456, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3416372537612915, + "timestamp": "2025-09-05 08:52:44.927124", + "step": 457, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:52:45.181071", + "step": 457, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2750747799873352, + "timestamp": "2025-09-05 08:52:45.183843", + "step": 458, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:52:45.394187", + "step": 458, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3312303125858307, + "timestamp": "2025-09-05 08:52:45.396553", + "step": 459, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:52:45.594916", + "step": 459, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4751344919204712, + "timestamp": "2025-09-05 08:52:45.612023", + "step": 460, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:52:51.067022", + "step": 460, + "epoch": 1 + }, + { + "type": "pplx", + "content": 59.67811926915719, + "timestamp": "2025-09-05 08:52:51.069230", + "step": 460, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:52:51.234053", + "step": 460, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3601231575012207, + "timestamp": "2025-09-05 08:52:51.236726", + "step": 461, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:52:51.451554", + "step": 461, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3900015354156494, + "timestamp": "2025-09-05 08:52:51.459374", + "step": 462, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:52:51.668075", + "step": 462, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.355674684047699, + "timestamp": "2025-09-05 08:52:51.676778", + "step": 463, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:51.889242", + "step": 463, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30925506353378296, + "timestamp": "2025-09-05 08:52:51.913685", + "step": 464, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:52:52.111744", + "step": 464, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.41809117794036865, + "timestamp": "2025-09-05 08:52:52.114550", + "step": 465, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:52.335190", + "step": 465, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31774064898490906, + "timestamp": "2025-09-05 08:52:52.339915", + "step": 466, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:52:52.548704", + "step": 466, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.22886255383491516, + "timestamp": "2025-09-05 08:52:52.593754", + "step": 467, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:52.803549", + "step": 467, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3445352017879486, + "timestamp": "2025-09-05 08:52:52.822863", + "step": 468, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:52:53.018136", + "step": 468, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.41649383306503296, + "timestamp": "2025-09-05 08:52:53.020410", + "step": 469, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:53.225503", + "step": 469, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5665808320045471, + "timestamp": "2025-09-05 08:52:53.228809", + "step": 470, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:53.431781", + "step": 470, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4123092293739319, + "timestamp": "2025-09-05 08:52:53.433893", + "step": 471, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:53.636862", + "step": 471, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4420497417449951, + "timestamp": "2025-09-05 08:52:53.650920", + "step": 472, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:52:53.845865", + "step": 472, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.35629701614379883, + "timestamp": "2025-09-05 08:52:53.850369", + "step": 473, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:54.163112", + "step": 473, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3124181926250458, + "timestamp": "2025-09-05 08:52:54.206389", + "step": 474, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:52:54.461222", + "step": 474, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5296250581741333, + "timestamp": "2025-09-05 08:52:54.463422", + "step": 475, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:52:54.680325", + "step": 475, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.42064934968948364, + "timestamp": "2025-09-05 08:52:54.706467", + "step": 476, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:52:54.892954", + "step": 476, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37133467197418213, + "timestamp": "2025-09-05 08:52:54.898729", + "step": 477, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:52:55.097691", + "step": 477, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.311069518327713, + "timestamp": "2025-09-05 08:52:55.100288", + "step": 478, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:52:55.397284", + "step": 478, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.174685537815094, + "timestamp": "2025-09-05 08:52:55.401764", + "step": 479, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:52:55.657892", + "step": 479, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4376969337463379, + "timestamp": "2025-09-05 08:52:55.674695", + "step": 480, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:53:01.067886", + "step": 480, + "epoch": 1 + }, + { + "type": "pplx", + "content": 60.60817761059333, + "timestamp": "2025-09-05 08:53:01.070163", + "step": 480, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 480", + "timestamp": "2025-09-05 08:53:01.614052", + "step": 480, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:53:01.877643", + "step": 480, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3756929636001587, + "timestamp": "2025-09-05 08:53:01.883127", + "step": 481, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:02.085810", + "step": 481, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39133283495903015, + "timestamp": "2025-09-05 08:53:02.088041", + "step": 482, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:02.296584", + "step": 482, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2793099284172058, + "timestamp": "2025-09-05 08:53:02.303765", + "step": 483, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:53:02.505169", + "step": 483, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3743014335632324, + "timestamp": "2025-09-05 08:53:02.519161", + "step": 484, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:53:02.754265", + "step": 484, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3891092538833618, + "timestamp": "2025-09-05 08:53:02.798142", + "step": 485, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:03.003319", + "step": 485, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.377437561750412, + "timestamp": "2025-09-05 08:53:03.005920", + "step": 486, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:03.218782", + "step": 486, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2652597725391388, + "timestamp": "2025-09-05 08:53:03.223824", + "step": 487, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:03.435831", + "step": 487, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.42560693621635437, + "timestamp": "2025-09-05 08:53:03.453133", + "step": 488, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:03.677906", + "step": 488, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.42667150497436523, + "timestamp": "2025-09-05 08:53:03.681745", + "step": 489, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:03.887673", + "step": 489, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.21720796823501587, + "timestamp": "2025-09-05 08:53:03.890582", + "step": 490, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:53:04.088914", + "step": 490, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2846270203590393, + "timestamp": "2025-09-05 08:53:04.090855", + "step": 491, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:53:04.289239", + "step": 491, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.420718789100647, + "timestamp": "2025-09-05 08:53:04.306191", + "step": 492, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:04.504965", + "step": 492, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4062350392341614, + "timestamp": "2025-09-05 08:53:04.520504", + "step": 493, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:04.812004", + "step": 493, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36269500851631165, + "timestamp": "2025-09-05 08:53:04.822831", + "step": 494, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:05.030379", + "step": 494, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.40626439452171326, + "timestamp": "2025-09-05 08:53:05.032308", + "step": 495, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:05.239160", + "step": 495, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39664778113365173, + "timestamp": "2025-09-05 08:53:05.257545", + "step": 496, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:05.455355", + "step": 496, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4610913395881653, + "timestamp": "2025-09-05 08:53:05.459367", + "step": 497, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:53:05.670320", + "step": 497, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3041105568408966, + "timestamp": "2025-09-05 08:53:05.675017", + "step": 498, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:05.885044", + "step": 498, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36558741331100464, + "timestamp": "2025-09-05 08:53:05.887386", + "step": 499, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:06.094042", + "step": 499, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.29611101746559143, + "timestamp": "2025-09-05 08:53:06.108327", + "step": 500, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:53:10.952066", + "step": 500, + "epoch": 1 + }, + { + "type": "pplx", + "content": 61.06419594850436, + "timestamp": "2025-09-05 08:53:10.955248", + "step": 500, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:53:11.116573", + "step": 500, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3699820637702942, + "timestamp": "2025-09-05 08:53:11.120663", + "step": 501, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:11.289895", + "step": 501, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3068196177482605, + "timestamp": "2025-09-05 08:53:11.292064", + "step": 502, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:11.497515", + "step": 502, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.401764839887619, + "timestamp": "2025-09-05 08:53:11.499413", + "step": 503, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:53:11.694902", + "step": 503, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3740246295928955, + "timestamp": "2025-09-05 08:53:11.710379", + "step": 504, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:53:11.902108", + "step": 504, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3317362368106842, + "timestamp": "2025-09-05 08:53:11.904239", + "step": 505, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:12.110463", + "step": 505, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.32889097929000854, + "timestamp": "2025-09-05 08:53:12.112806", + "step": 506, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:12.314073", + "step": 506, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30558574199676514, + "timestamp": "2025-09-05 08:53:12.316155", + "step": 507, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:12.524019", + "step": 507, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4227554500102997, + "timestamp": "2025-09-05 08:53:12.541949", + "step": 508, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:12.742901", + "step": 508, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31141963601112366, + "timestamp": "2025-09-05 08:53:12.745837", + "step": 509, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:53:12.951102", + "step": 509, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3356727659702301, + "timestamp": "2025-09-05 08:53:12.953827", + "step": 510, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:53:13.155485", + "step": 510, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.27994629740715027, + "timestamp": "2025-09-05 08:53:13.158290", + "step": 511, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:13.365905", + "step": 511, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.40879178047180176, + "timestamp": "2025-09-05 08:53:13.382155", + "step": 512, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:13.596253", + "step": 512, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.42021042108535767, + "timestamp": "2025-09-05 08:53:13.598287", + "step": 513, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:13.810567", + "step": 513, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.47443243861198425, + "timestamp": "2025-09-05 08:53:13.814212", + "step": 514, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:14.020899", + "step": 514, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.47561585903167725, + "timestamp": "2025-09-05 08:53:14.024098", + "step": 515, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:53:14.276776", + "step": 515, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3374590277671814, + "timestamp": "2025-09-05 08:53:14.291744", + "step": 516, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:14.484079", + "step": 516, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3174070715904236, + "timestamp": "2025-09-05 08:53:14.487904", + "step": 517, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:53:14.688330", + "step": 517, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5130405426025391, + "timestamp": "2025-09-05 08:53:14.705020", + "step": 518, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:14.916635", + "step": 518, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.44266101717948914, + "timestamp": "2025-09-05 08:53:14.921476", + "step": 519, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:15.124075", + "step": 519, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.307713121175766, + "timestamp": "2025-09-05 08:53:15.140877", + "step": 520, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:53:20.516080", + "step": 520, + "epoch": 1 + }, + { + "type": "pplx", + "content": 60.16279337842755, + "timestamp": "2025-09-05 08:53:20.518402", + "step": 520, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 520", + "timestamp": "2025-09-05 08:53:21.039725", + "step": 520, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:21.209774", + "step": 520, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4752398729324341, + "timestamp": "2025-09-05 08:53:21.211928", + "step": 521, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:21.467367", + "step": 521, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3615381419658661, + "timestamp": "2025-09-05 08:53:21.469896", + "step": 522, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:53:21.740291", + "step": 522, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3906700313091278, + "timestamp": "2025-09-05 08:53:21.742462", + "step": 523, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:21.955790", + "step": 523, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.40926316380500793, + "timestamp": "2025-09-05 08:53:21.972562", + "step": 524, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:22.163717", + "step": 524, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3688322603702545, + "timestamp": "2025-09-05 08:53:22.167749", + "step": 525, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:22.418586", + "step": 525, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30737367272377014, + "timestamp": "2025-09-05 08:53:22.420423", + "step": 526, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:22.627783", + "step": 526, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3638547956943512, + "timestamp": "2025-09-05 08:53:22.632266", + "step": 527, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:22.840299", + "step": 527, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3204697072505951, + "timestamp": "2025-09-05 08:53:22.858692", + "step": 528, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:53:23.129245", + "step": 528, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33960676193237305, + "timestamp": "2025-09-05 08:53:23.131263", + "step": 529, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:23.336988", + "step": 529, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.26278024911880493, + "timestamp": "2025-09-05 08:53:23.343774", + "step": 530, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:23.545603", + "step": 530, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.46334969997406006, + "timestamp": "2025-09-05 08:53:23.547858", + "step": 531, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:53:23.748378", + "step": 531, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3730030059814453, + "timestamp": "2025-09-05 08:53:23.763616", + "step": 532, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:23.955681", + "step": 532, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3750768303871155, + "timestamp": "2025-09-05 08:53:23.958740", + "step": 533, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:24.159708", + "step": 533, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4468510150909424, + "timestamp": "2025-09-05 08:53:24.162295", + "step": 534, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:24.358945", + "step": 534, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33164605498313904, + "timestamp": "2025-09-05 08:53:24.361476", + "step": 535, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:24.560635", + "step": 535, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.41563618183135986, + "timestamp": "2025-09-05 08:53:24.583314", + "step": 536, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:53:24.783434", + "step": 536, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2689324915409088, + "timestamp": "2025-09-05 08:53:24.786279", + "step": 537, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 240 + ], + "flops": 4800029206464.0 + }, + "timestamp": "2025-09-05 08:53:24.996809", + "step": 537, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4886358678340912, + "timestamp": "2025-09-05 08:53:24.998662", + "step": 538, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:53:25.202991", + "step": 538, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31535568833351135, + "timestamp": "2025-09-05 08:53:25.205119", + "step": 539, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:25.415088", + "step": 539, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30104270577430725, + "timestamp": "2025-09-05 08:53:25.430097", + "step": 540, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:53:30.312544", + "step": 540, + "epoch": 1 + }, + { + "type": "pplx", + "content": 59.515503396161826, + "timestamp": "2025-09-05 08:53:30.316745", + "step": 540, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:30.481458", + "step": 540, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38238897919654846, + "timestamp": "2025-09-05 08:53:30.483585", + "step": 541, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:30.652504", + "step": 541, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3997941017150879, + "timestamp": "2025-09-05 08:53:30.656351", + "step": 542, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:30.862613", + "step": 542, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38764917850494385, + "timestamp": "2025-09-05 08:53:30.865289", + "step": 543, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:31.115871", + "step": 543, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2606298327445984, + "timestamp": "2025-09-05 08:53:31.132908", + "step": 544, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:31.331423", + "step": 544, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4200422465801239, + "timestamp": "2025-09-05 08:53:31.333568", + "step": 545, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:31.542945", + "step": 545, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30847230553627014, + "timestamp": "2025-09-05 08:53:31.545735", + "step": 546, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:31.755943", + "step": 546, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3630848228931427, + "timestamp": "2025-09-05 08:53:31.758184", + "step": 547, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:53:31.961967", + "step": 547, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4338149428367615, + "timestamp": "2025-09-05 08:53:31.982517", + "step": 548, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:32.181564", + "step": 548, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.43823084235191345, + "timestamp": "2025-09-05 08:53:32.183709", + "step": 549, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:32.393487", + "step": 549, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5095869898796082, + "timestamp": "2025-09-05 08:53:32.395847", + "step": 550, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:32.607294", + "step": 550, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4471019506454468, + "timestamp": "2025-09-05 08:53:32.609524", + "step": 551, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:32.821073", + "step": 551, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2290228307247162, + "timestamp": "2025-09-05 08:53:32.839258", + "step": 552, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:33.041860", + "step": 552, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.23595690727233887, + "timestamp": "2025-09-05 08:53:33.044168", + "step": 553, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:53:33.253035", + "step": 553, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.19498829543590546, + "timestamp": "2025-09-05 08:53:33.255795", + "step": 554, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:33.465968", + "step": 554, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30002859234809875, + "timestamp": "2025-09-05 08:53:33.467878", + "step": 555, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:33.681230", + "step": 555, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3351731300354004, + "timestamp": "2025-09-05 08:53:33.695980", + "step": 556, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:53:33.940273", + "step": 556, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3931390345096588, + "timestamp": "2025-09-05 08:53:33.943284", + "step": 557, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:34.150261", + "step": 557, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.28822633624076843, + "timestamp": "2025-09-05 08:53:34.153543", + "step": 558, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:53:34.351789", + "step": 558, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.32444193959236145, + "timestamp": "2025-09-05 08:53:34.354902", + "step": 559, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:53:34.561937", + "step": 559, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3384038209915161, + "timestamp": "2025-09-05 08:53:34.621017", + "step": 560, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:53:39.663441", + "step": 560, + "epoch": 1 + }, + { + "type": "pplx", + "content": 59.60952474637825, + "timestamp": "2025-09-05 08:53:39.666181", + "step": 560, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 560", + "timestamp": "2025-09-05 08:53:40.136439", + "step": 560, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:53:40.320945", + "step": 560, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2801932096481323, + "timestamp": "2025-09-05 08:53:40.323253", + "step": 561, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:53:40.532084", + "step": 561, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.41452479362487793, + "timestamp": "2025-09-05 08:53:40.534062", + "step": 562, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:40.735016", + "step": 562, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.24028798937797546, + "timestamp": "2025-09-05 08:53:40.737869", + "step": 563, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:40.937501", + "step": 563, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3565094769001007, + "timestamp": "2025-09-05 08:53:40.952391", + "step": 564, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:53:41.150448", + "step": 564, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.493753045797348, + "timestamp": "2025-09-05 08:53:41.152826", + "step": 565, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:41.350962", + "step": 565, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3578570485115051, + "timestamp": "2025-09-05 08:53:41.353796", + "step": 566, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:53:41.562560", + "step": 566, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3159957230091095, + "timestamp": "2025-09-05 08:53:41.584877", + "step": 567, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:41.804368", + "step": 567, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3041984736919403, + "timestamp": "2025-09-05 08:53:41.819203", + "step": 568, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:42.007943", + "step": 568, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.32441309094429016, + "timestamp": "2025-09-05 08:53:42.011035", + "step": 569, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:42.219961", + "step": 569, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2862820029258728, + "timestamp": "2025-09-05 08:53:42.222290", + "step": 570, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:42.424797", + "step": 570, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3482998013496399, + "timestamp": "2025-09-05 08:53:42.426971", + "step": 571, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:42.628721", + "step": 571, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31323230266571045, + "timestamp": "2025-09-05 08:53:42.646181", + "step": 572, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:42.835879", + "step": 572, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.348871648311615, + "timestamp": "2025-09-05 08:53:42.837879", + "step": 573, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:53:43.034316", + "step": 573, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3106140196323395, + "timestamp": "2025-09-05 08:53:43.036776", + "step": 574, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:43.254863", + "step": 574, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.47986507415771484, + "timestamp": "2025-09-05 08:53:43.256993", + "step": 575, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:43.455155", + "step": 575, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4856138229370117, + "timestamp": "2025-09-05 08:53:43.469870", + "step": 576, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:53:43.659035", + "step": 576, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.23853756487369537, + "timestamp": "2025-09-05 08:53:43.661541", + "step": 577, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:43.837556", + "step": 577, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34495970606803894, + "timestamp": "2025-09-05 08:53:43.840008", + "step": 578, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:53:44.024806", + "step": 578, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3690643906593323, + "timestamp": "2025-09-05 08:53:44.028077", + "step": 579, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:44.212378", + "step": 579, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33360567688941956, + "timestamp": "2025-09-05 08:53:44.229347", + "step": 580, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:53:49.114108", + "step": 580, + "epoch": 1 + }, + { + "type": "pplx", + "content": 60.10085432045332, + "timestamp": "2025-09-05 08:53:49.117763", + "step": 580, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:49.263689", + "step": 580, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3735733926296234, + "timestamp": "2025-09-05 08:53:49.265593", + "step": 581, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:53:49.413395", + "step": 581, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4649202227592468, + "timestamp": "2025-09-05 08:53:49.420816", + "step": 582, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:53:49.651509", + "step": 582, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3592207431793213, + "timestamp": "2025-09-05 08:53:49.653601", + "step": 583, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:49.843228", + "step": 583, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.45228156447410583, + "timestamp": "2025-09-05 08:53:49.861343", + "step": 584, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:50.039570", + "step": 584, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31664058566093445, + "timestamp": "2025-09-05 08:53:50.042225", + "step": 585, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:50.218017", + "step": 585, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33015117049217224, + "timestamp": "2025-09-05 08:53:50.220194", + "step": 586, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:50.400985", + "step": 586, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.367570698261261, + "timestamp": "2025-09-05 08:53:50.404104", + "step": 587, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:53:50.611204", + "step": 587, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2882497310638428, + "timestamp": "2025-09-05 08:53:50.629492", + "step": 588, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:50.813865", + "step": 588, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37059807777404785, + "timestamp": "2025-09-05 08:53:50.817712", + "step": 589, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:51.023041", + "step": 589, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3063223361968994, + "timestamp": "2025-09-05 08:53:51.025798", + "step": 590, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:51.213430", + "step": 590, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.45722252130508423, + "timestamp": "2025-09-05 08:53:51.256253", + "step": 591, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:53:51.435498", + "step": 591, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.23291143774986267, + "timestamp": "2025-09-05 08:53:51.449993", + "step": 592, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:51.620906", + "step": 592, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3415684700012207, + "timestamp": "2025-09-05 08:53:51.624124", + "step": 593, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:51.808049", + "step": 593, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3442092537879944, + "timestamp": "2025-09-05 08:53:51.810700", + "step": 594, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:51.988789", + "step": 594, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.27133557200431824, + "timestamp": "2025-09-05 08:53:51.991133", + "step": 595, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:52.243889", + "step": 595, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.27585190534591675, + "timestamp": "2025-09-05 08:53:52.258810", + "step": 596, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:52.426840", + "step": 596, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.41854625940322876, + "timestamp": "2025-09-05 08:53:52.428957", + "step": 597, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:52.606477", + "step": 597, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4163596034049988, + "timestamp": "2025-09-05 08:53:52.609183", + "step": 598, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:53:52.794736", + "step": 598, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.24686819314956665, + "timestamp": "2025-09-05 08:53:52.797153", + "step": 599, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:53:52.993744", + "step": 599, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.27659153938293457, + "timestamp": "2025-09-05 08:53:53.008373", + "step": 600, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:53:58.070605", + "step": 600, + "epoch": 1 + }, + { + "type": "pplx", + "content": 60.150732301423496, + "timestamp": "2025-09-05 08:53:58.072774", + "step": 600, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 600", + "timestamp": "2025-09-05 08:53:58.524032", + "step": 600, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:53:58.695316", + "step": 600, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36288002133369446, + "timestamp": "2025-09-05 08:53:58.697367", + "step": 601, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:58.904083", + "step": 601, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2768324017524719, + "timestamp": "2025-09-05 08:53:58.906425", + "step": 602, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:53:59.104371", + "step": 602, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3677636682987213, + "timestamp": "2025-09-05 08:53:59.106677", + "step": 603, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:59.315197", + "step": 603, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5175231099128723, + "timestamp": "2025-09-05 08:53:59.329583", + "step": 604, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:53:59.521505", + "step": 604, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30161038041114807, + "timestamp": "2025-09-05 08:53:59.524024", + "step": 605, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:53:59.720228", + "step": 605, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.19638413190841675, + "timestamp": "2025-09-05 08:53:59.723539", + "step": 606, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:53:59.931213", + "step": 606, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.403491348028183, + "timestamp": "2025-09-05 08:53:59.933056", + "step": 607, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:00.142014", + "step": 607, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5172940492630005, + "timestamp": "2025-09-05 08:54:00.158712", + "step": 608, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:00.360444", + "step": 608, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.43559950590133667, + "timestamp": "2025-09-05 08:54:00.363569", + "step": 609, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:54:00.575220", + "step": 609, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39010724425315857, + "timestamp": "2025-09-05 08:54:00.578011", + "step": 610, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:54:00.785308", + "step": 610, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38972899317741394, + "timestamp": "2025-09-05 08:54:00.788490", + "step": 611, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:54:00.998969", + "step": 611, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.26329943537712097, + "timestamp": "2025-09-05 08:54:01.015379", + "step": 612, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:01.213718", + "step": 612, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.47966381907463074, + "timestamp": "2025-09-05 08:54:01.216175", + "step": 613, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:01.426283", + "step": 613, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3759992718696594, + "timestamp": "2025-09-05 08:54:01.429304", + "step": 614, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:01.648956", + "step": 614, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.43966683745384216, + "timestamp": "2025-09-05 08:54:01.650896", + "step": 615, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:01.857825", + "step": 615, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2430787980556488, + "timestamp": "2025-09-05 08:54:01.872042", + "step": 616, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:02.064318", + "step": 616, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3986109793186188, + "timestamp": "2025-09-05 08:54:02.066439", + "step": 617, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:02.314801", + "step": 617, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34388166666030884, + "timestamp": "2025-09-05 08:54:02.347074", + "step": 618, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:02.556618", + "step": 618, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.23649443686008453, + "timestamp": "2025-09-05 08:54:02.559154", + "step": 619, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:02.759787", + "step": 619, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3429713249206543, + "timestamp": "2025-09-05 08:54:02.776171", + "step": 620, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:54:07.720311", + "step": 620, + "epoch": 1 + }, + { + "type": "pplx", + "content": 60.0059737944772, + "timestamp": "2025-09-05 08:54:07.722616", + "step": 620, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:07.886060", + "step": 620, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38172709941864014, + "timestamp": "2025-09-05 08:54:07.888163", + "step": 621, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:54:08.085941", + "step": 621, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.29247331619262695, + "timestamp": "2025-09-05 08:54:08.089072", + "step": 622, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:08.289376", + "step": 622, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30984073877334595, + "timestamp": "2025-09-05 08:54:08.291291", + "step": 623, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:54:08.489187", + "step": 623, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4325138032436371, + "timestamp": "2025-09-05 08:54:08.504002", + "step": 624, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:08.693990", + "step": 624, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38128820061683655, + "timestamp": "2025-09-05 08:54:08.697641", + "step": 625, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:54:08.897279", + "step": 625, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34949132800102234, + "timestamp": "2025-09-05 08:54:08.899825", + "step": 626, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:54:09.107520", + "step": 626, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.21804000437259674, + "timestamp": "2025-09-05 08:54:09.109945", + "step": 627, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:09.318482", + "step": 627, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31279170513153076, + "timestamp": "2025-09-05 08:54:09.334969", + "step": 628, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:09.524559", + "step": 628, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4154609441757202, + "timestamp": "2025-09-05 08:54:09.527910", + "step": 629, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:09.724828", + "step": 629, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36364659667015076, + "timestamp": "2025-09-05 08:54:09.727232", + "step": 630, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:09.923511", + "step": 630, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.566509485244751, + "timestamp": "2025-09-05 08:54:09.925616", + "step": 631, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:10.122125", + "step": 631, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2817482054233551, + "timestamp": "2025-09-05 08:54:10.136938", + "step": 632, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:10.328401", + "step": 632, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4373268485069275, + "timestamp": "2025-09-05 08:54:10.330653", + "step": 633, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:54:10.537850", + "step": 633, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2138715237379074, + "timestamp": "2025-09-05 08:54:10.540845", + "step": 634, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:54:10.738631", + "step": 634, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3299078643321991, + "timestamp": "2025-09-05 08:54:10.740930", + "step": 635, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:54:10.937830", + "step": 635, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38988572359085083, + "timestamp": "2025-09-05 08:54:10.952014", + "step": 636, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:54:11.142159", + "step": 636, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5013632774353027, + "timestamp": "2025-09-05 08:54:11.146725", + "step": 637, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:11.353235", + "step": 637, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.25191569328308105, + "timestamp": "2025-09-05 08:54:11.355726", + "step": 638, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:11.552894", + "step": 638, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3327215313911438, + "timestamp": "2025-09-05 08:54:11.556011", + "step": 639, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:11.752920", + "step": 639, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3358171880245209, + "timestamp": "2025-09-05 08:54:11.767103", + "step": 640, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:54:16.930895", + "step": 640, + "epoch": 1 + }, + { + "type": "pplx", + "content": 60.013876298957854, + "timestamp": "2025-09-05 08:54:16.932912", + "step": 640, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 640", + "timestamp": "2025-09-05 08:54:17.406898", + "step": 640, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:54:17.592032", + "step": 640, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.49637261033058167, + "timestamp": "2025-09-05 08:54:17.593921", + "step": 641, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:17.799004", + "step": 641, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33820703625679016, + "timestamp": "2025-09-05 08:54:17.801216", + "step": 642, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:18.013549", + "step": 642, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3308141529560089, + "timestamp": "2025-09-05 08:54:18.016186", + "step": 643, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:18.258292", + "step": 643, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.40199655294418335, + "timestamp": "2025-09-05 08:54:18.273111", + "step": 644, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:54:18.462659", + "step": 644, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30657947063446045, + "timestamp": "2025-09-05 08:54:18.464765", + "step": 645, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:18.665532", + "step": 645, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3859306573867798, + "timestamp": "2025-09-05 08:54:18.667913", + "step": 646, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:18.876768", + "step": 646, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3470034599304199, + "timestamp": "2025-09-05 08:54:18.879140", + "step": 647, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:54:19.110177", + "step": 647, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3591715395450592, + "timestamp": "2025-09-05 08:54:19.125306", + "step": 648, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:19.314904", + "step": 648, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.29138100147247314, + "timestamp": "2025-09-05 08:54:19.317592", + "step": 649, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:19.526396", + "step": 649, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.48001521825790405, + "timestamp": "2025-09-05 08:54:19.529551", + "step": 650, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:54:19.734777", + "step": 650, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30436667799949646, + "timestamp": "2025-09-05 08:54:19.738102", + "step": 651, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:54:19.944375", + "step": 651, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.41145777702331543, + "timestamp": "2025-09-05 08:54:19.960171", + "step": 652, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:54:20.150221", + "step": 652, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4586491584777832, + "timestamp": "2025-09-05 08:54:20.152746", + "step": 653, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:20.359771", + "step": 653, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2969052791595459, + "timestamp": "2025-09-05 08:54:20.362605", + "step": 654, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:20.572199", + "step": 654, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4161323010921478, + "timestamp": "2025-09-05 08:54:20.575376", + "step": 655, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:20.782477", + "step": 655, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36784666776657104, + "timestamp": "2025-09-05 08:54:20.799260", + "step": 656, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:21.003719", + "step": 656, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30674105882644653, + "timestamp": "2025-09-05 08:54:21.005922", + "step": 657, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:21.204382", + "step": 657, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3142521381378174, + "timestamp": "2025-09-05 08:54:21.207457", + "step": 658, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:54:21.421749", + "step": 658, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4763060212135315, + "timestamp": "2025-09-05 08:54:21.425096", + "step": 659, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:21.631974", + "step": 659, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2550380825996399, + "timestamp": "2025-09-05 08:54:21.646742", + "step": 660, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:54:26.783208", + "step": 660, + "epoch": 1 + }, + { + "type": "pplx", + "content": 59.977893482287755, + "timestamp": "2025-09-05 08:54:26.785662", + "step": 660, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:26.946480", + "step": 660, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3140474855899811, + "timestamp": "2025-09-05 08:54:26.949023", + "step": 661, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:27.117596", + "step": 661, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5123893022537231, + "timestamp": "2025-09-05 08:54:27.120661", + "step": 662, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:54:27.326935", + "step": 662, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3251909017562866, + "timestamp": "2025-09-05 08:54:27.328976", + "step": 663, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:27.525600", + "step": 663, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34844574332237244, + "timestamp": "2025-09-05 08:54:27.540739", + "step": 664, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:54:27.740184", + "step": 664, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4417649209499359, + "timestamp": "2025-09-05 08:54:27.823505", + "step": 665, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:28.118452", + "step": 665, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2881294786930084, + "timestamp": "2025-09-05 08:54:28.120838", + "step": 666, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:28.319720", + "step": 666, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3033196032047272, + "timestamp": "2025-09-05 08:54:28.322799", + "step": 667, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:28.532384", + "step": 667, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.35287392139434814, + "timestamp": "2025-09-05 08:54:28.547149", + "step": 668, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:54:28.739925", + "step": 668, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3460994362831116, + "timestamp": "2025-09-05 08:54:28.816991", + "step": 669, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:29.071266", + "step": 669, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2680738866329193, + "timestamp": "2025-09-05 08:54:29.073805", + "step": 670, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:54:29.279231", + "step": 670, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3204239010810852, + "timestamp": "2025-09-05 08:54:29.286163", + "step": 671, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:29.489604", + "step": 671, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2820996344089508, + "timestamp": "2025-09-05 08:54:29.508998", + "step": 672, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:29.798942", + "step": 672, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3304928243160248, + "timestamp": "2025-09-05 08:54:29.801447", + "step": 673, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:54:30.010222", + "step": 673, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3490171730518341, + "timestamp": "2025-09-05 08:54:30.013730", + "step": 674, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:30.224578", + "step": 674, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39344292879104614, + "timestamp": "2025-09-05 08:54:30.227117", + "step": 675, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:54:30.423153", + "step": 675, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2898615300655365, + "timestamp": "2025-09-05 08:54:30.440851", + "step": 676, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:54:30.692418", + "step": 676, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4104023575782776, + "timestamp": "2025-09-05 08:54:30.696366", + "step": 677, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:30.902770", + "step": 677, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30414190888404846, + "timestamp": "2025-09-05 08:54:30.905070", + "step": 678, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:31.103059", + "step": 678, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4684240520000458, + "timestamp": "2025-09-05 08:54:31.106036", + "step": 679, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:54:31.312497", + "step": 679, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34773412346839905, + "timestamp": "2025-09-05 08:54:31.328594", + "step": 680, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:54:36.267599", + "step": 680, + "epoch": 1 + }, + { + "type": "pplx", + "content": 60.13139868121867, + "timestamp": "2025-09-05 08:54:36.271850", + "step": 680, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 680", + "timestamp": "2025-09-05 08:54:36.750503", + "step": 680, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:54:36.998015", + "step": 680, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3354809880256653, + "timestamp": "2025-09-05 08:54:37.000382", + "step": 681, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:37.200209", + "step": 681, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.45760607719421387, + "timestamp": "2025-09-05 08:54:37.203851", + "step": 682, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:54:37.402177", + "step": 682, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.19389431178569794, + "timestamp": "2025-09-05 08:54:37.404069", + "step": 683, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:37.603549", + "step": 683, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.45813360810279846, + "timestamp": "2025-09-05 08:54:37.621567", + "step": 684, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 240 + ], + "flops": 4800029206464.0 + }, + "timestamp": "2025-09-05 08:54:37.823186", + "step": 684, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4114960730075836, + "timestamp": "2025-09-05 08:54:37.825184", + "step": 685, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:38.029268", + "step": 685, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.320042222738266, + "timestamp": "2025-09-05 08:54:38.032156", + "step": 686, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:54:38.228650", + "step": 686, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3017887473106384, + "timestamp": "2025-09-05 08:54:38.230931", + "step": 687, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:38.438526", + "step": 687, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.19925743341445923, + "timestamp": "2025-09-05 08:54:38.455228", + "step": 688, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:38.655031", + "step": 688, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3546088635921478, + "timestamp": "2025-09-05 08:54:38.656855", + "step": 689, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:38.855417", + "step": 689, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2611202001571655, + "timestamp": "2025-09-05 08:54:38.857665", + "step": 690, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:39.064991", + "step": 690, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4317798912525177, + "timestamp": "2025-09-05 08:54:39.067201", + "step": 691, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:54:39.262353", + "step": 691, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3688901662826538, + "timestamp": "2025-09-05 08:54:39.279461", + "step": 692, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:39.475335", + "step": 692, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.288263201713562, + "timestamp": "2025-09-05 08:54:39.477507", + "step": 693, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 240 + ], + "flops": 4800029206464.0 + }, + "timestamp": "2025-09-05 08:54:39.685377", + "step": 693, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38892319798469543, + "timestamp": "2025-09-05 08:54:39.687607", + "step": 694, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:54:39.894436", + "step": 694, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.334379643201828, + "timestamp": "2025-09-05 08:54:39.897342", + "step": 695, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:40.100425", + "step": 695, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37965139746665955, + "timestamp": "2025-09-05 08:54:40.116980", + "step": 696, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:40.314915", + "step": 696, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.380183607339859, + "timestamp": "2025-09-05 08:54:40.317039", + "step": 697, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:40.516922", + "step": 697, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37185657024383545, + "timestamp": "2025-09-05 08:54:40.519343", + "step": 698, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:40.717729", + "step": 698, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3406052589416504, + "timestamp": "2025-09-05 08:54:40.720247", + "step": 699, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:54:40.918093", + "step": 699, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.29893720149993896, + "timestamp": "2025-09-05 08:54:40.933084", + "step": 700, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:54:45.631525", + "step": 700, + "epoch": 1 + }, + { + "type": "pplx", + "content": 60.328020476201374, + "timestamp": "2025-09-05 08:54:45.634419", + "step": 700, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:45.796220", + "step": 700, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2962213158607483, + "timestamp": "2025-09-05 08:54:45.798188", + "step": 701, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:46.001282", + "step": 701, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36525845527648926, + "timestamp": "2025-09-05 08:54:46.003188", + "step": 702, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:46.201526", + "step": 702, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.46742987632751465, + "timestamp": "2025-09-05 08:54:46.203916", + "step": 703, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:46.399294", + "step": 703, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.26656708121299744, + "timestamp": "2025-09-05 08:54:46.416569", + "step": 704, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:46.614553", + "step": 704, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3398812711238861, + "timestamp": "2025-09-05 08:54:46.616644", + "step": 705, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:46.811613", + "step": 705, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31698310375213623, + "timestamp": "2025-09-05 08:54:46.813745", + "step": 706, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:47.010716", + "step": 706, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3885927200317383, + "timestamp": "2025-09-05 08:54:47.012461", + "step": 707, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:47.216767", + "step": 707, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.26296836137771606, + "timestamp": "2025-09-05 08:54:47.230773", + "step": 708, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:47.427633", + "step": 708, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.41065317392349243, + "timestamp": "2025-09-05 08:54:47.429933", + "step": 709, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:47.626215", + "step": 709, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3585359752178192, + "timestamp": "2025-09-05 08:54:47.628728", + "step": 710, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:54:47.824583", + "step": 710, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.40104833245277405, + "timestamp": "2025-09-05 08:54:47.828309", + "step": 711, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:54:48.022731", + "step": 711, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4005032777786255, + "timestamp": "2025-09-05 08:54:48.036802", + "step": 712, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:48.224089", + "step": 712, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3810969591140747, + "timestamp": "2025-09-05 08:54:48.226238", + "step": 713, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:48.421427", + "step": 713, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4647989571094513, + "timestamp": "2025-09-05 08:54:48.423241", + "step": 714, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:48.627565", + "step": 714, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.26305657625198364, + "timestamp": "2025-09-05 08:54:48.629519", + "step": 715, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:48.826866", + "step": 715, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3350170850753784, + "timestamp": "2025-09-05 08:54:48.841368", + "step": 716, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:49.030348", + "step": 716, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2926214337348938, + "timestamp": "2025-09-05 08:54:49.032122", + "step": 717, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:54:49.239250", + "step": 717, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4400876760482788, + "timestamp": "2025-09-05 08:54:49.241213", + "step": 718, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:49.448651", + "step": 718, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2703559994697571, + "timestamp": "2025-09-05 08:54:49.450304", + "step": 719, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:54:49.657250", + "step": 719, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.1777232140302658, + "timestamp": "2025-09-05 08:54:49.673678", + "step": 720, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:54:54.347893", + "step": 720, + "epoch": 1 + }, + { + "type": "pplx", + "content": 60.46760355156991, + "timestamp": "2025-09-05 08:54:54.349759", + "step": 720, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 720", + "timestamp": "2025-09-05 08:54:54.823525", + "step": 720, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:55.017249", + "step": 720, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3396753668785095, + "timestamp": "2025-09-05 08:54:55.019130", + "step": 721, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:54:55.227345", + "step": 721, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.45180007815361023, + "timestamp": "2025-09-05 08:54:55.228947", + "step": 722, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:55.424212", + "step": 722, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.43839141726493835, + "timestamp": "2025-09-05 08:54:55.426285", + "step": 723, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:55.623172", + "step": 723, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38653379678726196, + "timestamp": "2025-09-05 08:54:55.640180", + "step": 724, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:55.831590", + "step": 724, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.302320271730423, + "timestamp": "2025-09-05 08:54:55.833560", + "step": 725, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:56.038980", + "step": 725, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3913811147212982, + "timestamp": "2025-09-05 08:54:56.040877", + "step": 726, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:54:56.246521", + "step": 726, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2894538640975952, + "timestamp": "2025-09-05 08:54:56.248271", + "step": 727, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:54:56.452672", + "step": 727, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.29082658886909485, + "timestamp": "2025-09-05 08:54:56.469163", + "step": 728, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:56.664440", + "step": 728, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4690100848674774, + "timestamp": "2025-09-05 08:54:56.667290", + "step": 729, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:54:56.863708", + "step": 729, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4146786332130432, + "timestamp": "2025-09-05 08:54:56.866143", + "step": 730, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:54:57.061283", + "step": 730, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4398282766342163, + "timestamp": "2025-09-05 08:54:57.063210", + "step": 731, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:57.259524", + "step": 731, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3548581600189209, + "timestamp": "2025-09-05 08:54:57.273889", + "step": 732, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:54:57.459570", + "step": 732, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2405700534582138, + "timestamp": "2025-09-05 08:54:57.461338", + "step": 733, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:54:57.657548", + "step": 733, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.40556153655052185, + "timestamp": "2025-09-05 08:54:57.659320", + "step": 734, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:54:57.823137", + "step": 734, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3558599650859833, + "timestamp": "2025-09-05 08:54:57.825123", + "step": 735, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:54:58.028019", + "step": 735, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3774275779724121, + "timestamp": "2025-09-05 08:54:58.042770", + "step": 736, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:54:58.242072", + "step": 736, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4390884339809418, + "timestamp": "2025-09-05 08:54:58.245108", + "step": 737, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:54:58.441939", + "step": 737, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.27866774797439575, + "timestamp": "2025-09-05 08:54:58.451705", + "step": 738, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:54:58.648957", + "step": 738, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2765721082687378, + "timestamp": "2025-09-05 08:54:58.653039", + "step": 739, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:54:58.848295", + "step": 739, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.29264095425605774, + "timestamp": "2025-09-05 08:54:58.862799", + "step": 740, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:55:03.526058", + "step": 740, + "epoch": 1 + }, + { + "type": "pplx", + "content": 60.057011510032595, + "timestamp": "2025-09-05 08:55:03.528500", + "step": 740, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:03.689395", + "step": 740, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.24096165597438812, + "timestamp": "2025-09-05 08:55:03.691725", + "step": 741, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:03.861152", + "step": 741, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.276774525642395, + "timestamp": "2025-09-05 08:55:03.863850", + "step": 742, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:04.067519", + "step": 742, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4308259189128876, + "timestamp": "2025-09-05 08:55:04.069692", + "step": 743, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:55:04.275651", + "step": 743, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2998303472995758, + "timestamp": "2025-09-05 08:55:04.291124", + "step": 744, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:55:04.479757", + "step": 744, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3581094443798065, + "timestamp": "2025-09-05 08:55:04.481424", + "step": 745, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:04.686029", + "step": 745, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.44069504737854004, + "timestamp": "2025-09-05 08:55:04.687941", + "step": 746, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:04.892816", + "step": 746, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2966330647468567, + "timestamp": "2025-09-05 08:55:04.894551", + "step": 747, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:05.091527", + "step": 747, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4456374943256378, + "timestamp": "2025-09-05 08:55:05.105860", + "step": 748, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:05.300254", + "step": 748, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38735565543174744, + "timestamp": "2025-09-05 08:55:05.301897", + "step": 749, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:05.496733", + "step": 749, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3685053884983063, + "timestamp": "2025-09-05 08:55:05.498685", + "step": 750, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:05.705472", + "step": 750, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3653118908405304, + "timestamp": "2025-09-05 08:55:05.707251", + "step": 751, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:05.911657", + "step": 751, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.46427515149116516, + "timestamp": "2025-09-05 08:55:05.925953", + "step": 752, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:55:06.115561", + "step": 752, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33055830001831055, + "timestamp": "2025-09-05 08:55:06.117285", + "step": 753, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:06.321142", + "step": 753, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3238837718963623, + "timestamp": "2025-09-05 08:55:06.322865", + "step": 754, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:06.518158", + "step": 754, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33274197578430176, + "timestamp": "2025-09-05 08:55:06.519817", + "step": 755, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:06.716279", + "step": 755, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.6049551367759705, + "timestamp": "2025-09-05 08:55:06.732935", + "step": 756, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:55:06.929062", + "step": 756, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4643118977546692, + "timestamp": "2025-09-05 08:55:06.930811", + "step": 757, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:07.096596", + "step": 757, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37005674839019775, + "timestamp": "2025-09-05 08:55:07.098944", + "step": 758, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:07.306042", + "step": 758, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.270403116941452, + "timestamp": "2025-09-05 08:55:07.307925", + "step": 759, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:07.512213", + "step": 759, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3156965374946594, + "timestamp": "2025-09-05 08:55:07.521310", + "step": 760, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:55:12.156535", + "step": 760, + "epoch": 1 + }, + { + "type": "pplx", + "content": 59.66704724636567, + "timestamp": "2025-09-05 08:55:12.158506", + "step": 760, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 760", + "timestamp": "2025-09-05 08:55:12.626072", + "step": 760, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:12.793769", + "step": 760, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.24948005378246307, + "timestamp": "2025-09-05 08:55:12.795644", + "step": 761, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:12.995458", + "step": 761, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33802419900894165, + "timestamp": "2025-09-05 08:55:12.997015", + "step": 762, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:13.200557", + "step": 762, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2108272910118103, + "timestamp": "2025-09-05 08:55:13.202057", + "step": 763, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:13.397743", + "step": 763, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31162819266319275, + "timestamp": "2025-09-05 08:55:13.411933", + "step": 764, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:13.598710", + "step": 764, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.44521042704582214, + "timestamp": "2025-09-05 08:55:13.600271", + "step": 765, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:13.795524", + "step": 765, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3431508243083954, + "timestamp": "2025-09-05 08:55:13.797365", + "step": 766, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:14.002948", + "step": 766, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31309953331947327, + "timestamp": "2025-09-05 08:55:14.004890", + "step": 767, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:14.199262", + "step": 767, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2508161664009094, + "timestamp": "2025-09-05 08:55:14.216268", + "step": 768, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:14.416044", + "step": 768, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.35422399640083313, + "timestamp": "2025-09-05 08:55:14.417865", + "step": 769, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:14.625474", + "step": 769, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39631184935569763, + "timestamp": "2025-09-05 08:55:14.627596", + "step": 770, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:55:14.834274", + "step": 770, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.35908469557762146, + "timestamp": "2025-09-05 08:55:14.836925", + "step": 771, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:15.033942", + "step": 771, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.44913047552108765, + "timestamp": "2025-09-05 08:55:15.050281", + "step": 772, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:55:15.248779", + "step": 772, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.35234954953193665, + "timestamp": "2025-09-05 08:55:15.250645", + "step": 773, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:15.455657", + "step": 773, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.26836010813713074, + "timestamp": "2025-09-05 08:55:15.457801", + "step": 774, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:15.655351", + "step": 774, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4194760024547577, + "timestamp": "2025-09-05 08:55:15.657289", + "step": 775, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:15.857106", + "step": 775, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4029027223587036, + "timestamp": "2025-09-05 08:55:15.871310", + "step": 776, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:16.067439", + "step": 776, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3827188014984131, + "timestamp": "2025-09-05 08:55:16.069132", + "step": 777, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:16.274296", + "step": 777, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3042537569999695, + "timestamp": "2025-09-05 08:55:16.276538", + "step": 778, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:16.485146", + "step": 778, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3845559358596802, + "timestamp": "2025-09-05 08:55:16.486916", + "step": 779, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:16.682919", + "step": 779, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3793331980705261, + "timestamp": "2025-09-05 08:55:16.696900", + "step": 780, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:55:21.332508", + "step": 780, + "epoch": 1 + }, + { + "type": "pplx", + "content": 59.53960384330279, + "timestamp": "2025-09-05 08:55:21.334280", + "step": 780, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:21.495075", + "step": 780, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.42627277970314026, + "timestamp": "2025-09-05 08:55:21.497499", + "step": 781, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:21.665091", + "step": 781, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2579745650291443, + "timestamp": "2025-09-05 08:55:21.666790", + "step": 782, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:21.872172", + "step": 782, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2808438539505005, + "timestamp": "2025-09-05 08:55:21.874532", + "step": 783, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:22.070858", + "step": 783, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2570682764053345, + "timestamp": "2025-09-05 08:55:22.080329", + "step": 784, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:22.242021", + "step": 784, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39906033873558044, + "timestamp": "2025-09-05 08:55:22.243659", + "step": 785, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:22.450233", + "step": 785, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.288041889667511, + "timestamp": "2025-09-05 08:55:22.452889", + "step": 786, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:22.621107", + "step": 786, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3034781217575073, + "timestamp": "2025-09-05 08:55:22.623097", + "step": 787, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:22.817939", + "step": 787, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3924509584903717, + "timestamp": "2025-09-05 08:55:22.827178", + "step": 788, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:55:22.991967", + "step": 788, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3101724684238434, + "timestamp": "2025-09-05 08:55:22.993571", + "step": 789, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:23.198900", + "step": 789, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.35364800691604614, + "timestamp": "2025-09-05 08:55:23.200466", + "step": 790, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:23.395760", + "step": 790, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.29069453477859497, + "timestamp": "2025-09-05 08:55:23.398959", + "step": 791, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:23.595445", + "step": 791, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2604013979434967, + "timestamp": "2025-09-05 08:55:23.611776", + "step": 792, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:23.809541", + "step": 792, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3716432452201843, + "timestamp": "2025-09-05 08:55:23.811288", + "step": 793, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:24.006648", + "step": 793, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2955947518348694, + "timestamp": "2025-09-05 08:55:24.008408", + "step": 794, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:24.204286", + "step": 794, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4398670792579651, + "timestamp": "2025-09-05 08:55:24.206033", + "step": 795, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:24.400419", + "step": 795, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39011335372924805, + "timestamp": "2025-09-05 08:55:24.414895", + "step": 796, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:55:24.610922", + "step": 796, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4034815728664398, + "timestamp": "2025-09-05 08:55:24.614525", + "step": 797, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:24.815788", + "step": 797, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33042198419570923, + "timestamp": "2025-09-05 08:55:24.817431", + "step": 798, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:25.022565", + "step": 798, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.35239022970199585, + "timestamp": "2025-09-05 08:55:25.024113", + "step": 799, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:25.229432", + "step": 799, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38791370391845703, + "timestamp": "2025-09-05 08:55:25.243544", + "step": 800, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:55:29.889311", + "step": 800, + "epoch": 1 + }, + { + "type": "pplx", + "content": 58.95891625814123, + "timestamp": "2025-09-05 08:55:29.891231", + "step": 800, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 800", + "timestamp": "2025-09-05 08:55:30.365184", + "step": 800, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:55:30.534979", + "step": 800, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36771687865257263, + "timestamp": "2025-09-05 08:55:30.537058", + "step": 801, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:30.733797", + "step": 801, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.265923410654068, + "timestamp": "2025-09-05 08:55:30.735543", + "step": 802, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:30.940549", + "step": 802, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3522961437702179, + "timestamp": "2025-09-05 08:55:30.942341", + "step": 803, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:31.108326", + "step": 803, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3711682856082916, + "timestamp": "2025-09-05 08:55:31.124440", + "step": 804, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:31.321294", + "step": 804, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.282815158367157, + "timestamp": "2025-09-05 08:55:31.325387", + "step": 805, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:31.524159", + "step": 805, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3416968882083893, + "timestamp": "2025-09-05 08:55:31.527357", + "step": 806, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:31.725722", + "step": 806, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.35129979252815247, + "timestamp": "2025-09-05 08:55:31.729004", + "step": 807, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:31.926674", + "step": 807, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4110635221004486, + "timestamp": "2025-09-05 08:55:31.943568", + "step": 808, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:55:32.141794", + "step": 808, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5824915766716003, + "timestamp": "2025-09-05 08:55:32.143517", + "step": 809, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:32.348023", + "step": 809, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4817580282688141, + "timestamp": "2025-09-05 08:55:32.350161", + "step": 810, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:32.555273", + "step": 810, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3244217038154602, + "timestamp": "2025-09-05 08:55:32.557016", + "step": 811, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:32.754146", + "step": 811, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.377000093460083, + "timestamp": "2025-09-05 08:55:32.763538", + "step": 812, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:32.928523", + "step": 812, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.40550920367240906, + "timestamp": "2025-09-05 08:55:32.930496", + "step": 813, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:33.135757", + "step": 813, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3100672662258148, + "timestamp": "2025-09-05 08:55:33.138151", + "step": 814, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:33.346041", + "step": 814, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37069234251976013, + "timestamp": "2025-09-05 08:55:33.348524", + "step": 815, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:33.553773", + "step": 815, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3425869047641754, + "timestamp": "2025-09-05 08:55:33.568801", + "step": 816, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:33.756241", + "step": 816, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4293539524078369, + "timestamp": "2025-09-05 08:55:33.758599", + "step": 817, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:33.962790", + "step": 817, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31967583298683167, + "timestamp": "2025-09-05 08:55:33.964832", + "step": 818, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:34.160870", + "step": 818, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38507017493247986, + "timestamp": "2025-09-05 08:55:34.163142", + "step": 819, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:34.333933", + "step": 819, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.42399299144744873, + "timestamp": "2025-09-05 08:55:34.349042", + "step": 820, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:55:39.001578", + "step": 820, + "epoch": 1 + }, + { + "type": "pplx", + "content": 58.07706559280723, + "timestamp": "2025-09-05 08:55:39.003502", + "step": 820, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:39.163673", + "step": 820, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3387710452079773, + "timestamp": "2025-09-05 08:55:39.165694", + "step": 821, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:39.333573", + "step": 821, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2951369285583496, + "timestamp": "2025-09-05 08:55:39.335550", + "step": 822, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:39.543460", + "step": 822, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.35738304257392883, + "timestamp": "2025-09-05 08:55:39.546054", + "step": 823, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:39.742891", + "step": 823, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3632713258266449, + "timestamp": "2025-09-05 08:55:39.756880", + "step": 824, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:55:39.953686", + "step": 824, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37211427092552185, + "timestamp": "2025-09-05 08:55:39.955910", + "step": 825, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:40.162235", + "step": 825, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3463480472564697, + "timestamp": "2025-09-05 08:55:40.164065", + "step": 826, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:55:40.371292", + "step": 826, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30579307675361633, + "timestamp": "2025-09-05 08:55:40.373150", + "step": 827, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:40.580090", + "step": 827, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.40940576791763306, + "timestamp": "2025-09-05 08:55:40.594902", + "step": 828, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:40.792047", + "step": 828, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36023303866386414, + "timestamp": "2025-09-05 08:55:40.793964", + "step": 829, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:40.990352", + "step": 829, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3491280972957611, + "timestamp": "2025-09-05 08:55:40.992165", + "step": 830, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:41.188266", + "step": 830, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.24948126077651978, + "timestamp": "2025-09-05 08:55:41.189890", + "step": 831, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:41.387216", + "step": 831, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31294217705726624, + "timestamp": "2025-09-05 08:55:41.401260", + "step": 832, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:41.590851", + "step": 832, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4010050594806671, + "timestamp": "2025-09-05 08:55:41.592639", + "step": 833, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:41.788041", + "step": 833, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3963695466518402, + "timestamp": "2025-09-05 08:55:41.790319", + "step": 834, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:41.986059", + "step": 834, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4662969708442688, + "timestamp": "2025-09-05 08:55:41.988012", + "step": 835, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:42.185669", + "step": 835, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.44282209873199463, + "timestamp": "2025-09-05 08:55:42.199987", + "step": 836, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:42.388364", + "step": 836, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2699596881866455, + "timestamp": "2025-09-05 08:55:42.390059", + "step": 837, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:42.555606", + "step": 837, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.42487242817878723, + "timestamp": "2025-09-05 08:55:42.557823", + "step": 838, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:42.763983", + "step": 838, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.25971612334251404, + "timestamp": "2025-09-05 08:55:42.766011", + "step": 839, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:55:42.964541", + "step": 839, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3642326593399048, + "timestamp": "2025-09-05 08:55:42.978888", + "step": 840, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:55:47.651856", + "step": 840, + "epoch": 1 + }, + { + "type": "pplx", + "content": 58.15736728059577, + "timestamp": "2025-09-05 08:55:47.653828", + "step": 840, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 840", + "timestamp": "2025-09-05 08:55:48.144064", + "step": 840, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:48.312663", + "step": 840, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.517951488494873, + "timestamp": "2025-09-05 08:55:48.314430", + "step": 841, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:48.508434", + "step": 841, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.297713965177536, + "timestamp": "2025-09-05 08:55:48.510702", + "step": 842, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:48.707704", + "step": 842, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4420361816883087, + "timestamp": "2025-09-05 08:55:48.709846", + "step": 843, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:48.876016", + "step": 843, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4004204273223877, + "timestamp": "2025-09-05 08:55:48.892549", + "step": 844, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:49.090938", + "step": 844, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3982437551021576, + "timestamp": "2025-09-05 08:55:49.092914", + "step": 845, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:55:49.261019", + "step": 845, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36752861738204956, + "timestamp": "2025-09-05 08:55:49.265198", + "step": 846, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:49.460827", + "step": 846, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2838650941848755, + "timestamp": "2025-09-05 08:55:49.463902", + "step": 847, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:49.673060", + "step": 847, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3069932758808136, + "timestamp": "2025-09-05 08:55:49.687326", + "step": 848, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:49.877164", + "step": 848, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.43903687596321106, + "timestamp": "2025-09-05 08:55:49.879086", + "step": 849, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:50.075926", + "step": 849, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2802756428718567, + "timestamp": "2025-09-05 08:55:50.078314", + "step": 850, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:50.285280", + "step": 850, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.24855327606201172, + "timestamp": "2025-09-05 08:55:50.287275", + "step": 851, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:50.489334", + "step": 851, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4161975681781769, + "timestamp": "2025-09-05 08:55:50.503796", + "step": 852, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:50.695106", + "step": 852, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.45736148953437805, + "timestamp": "2025-09-05 08:55:50.698191", + "step": 853, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:50.903392", + "step": 853, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36564257740974426, + "timestamp": "2025-09-05 08:55:50.905223", + "step": 854, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:51.102891", + "step": 854, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37967172265052795, + "timestamp": "2025-09-05 08:55:51.105298", + "step": 855, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:51.304529", + "step": 855, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4026603400707245, + "timestamp": "2025-09-05 08:55:51.321055", + "step": 856, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:55:51.519946", + "step": 856, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.22071857750415802, + "timestamp": "2025-09-05 08:55:51.522074", + "step": 857, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:51.720784", + "step": 857, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.47518184781074524, + "timestamp": "2025-09-05 08:55:51.722581", + "step": 858, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:51.928566", + "step": 858, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3092558681964874, + "timestamp": "2025-09-05 08:55:51.930334", + "step": 859, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:52.127465", + "step": 859, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2976468801498413, + "timestamp": "2025-09-05 08:55:52.142114", + "step": 860, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:55:56.850637", + "step": 860, + "epoch": 1 + }, + { + "type": "pplx", + "content": 58.433954195954186, + "timestamp": "2025-09-05 08:55:56.852647", + "step": 860, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:57.015085", + "step": 860, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3771287798881531, + "timestamp": "2025-09-05 08:55:57.016999", + "step": 861, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:57.221522", + "step": 861, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4275374710559845, + "timestamp": "2025-09-05 08:55:57.223447", + "step": 862, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:57.432116", + "step": 862, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4731064438819885, + "timestamp": "2025-09-05 08:55:57.434290", + "step": 863, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:57.632130", + "step": 863, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2372477501630783, + "timestamp": "2025-09-05 08:55:57.646792", + "step": 864, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:57.846296", + "step": 864, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.328989714384079, + "timestamp": "2025-09-05 08:55:57.848392", + "step": 865, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:55:58.048431", + "step": 865, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.43380752205848694, + "timestamp": "2025-09-05 08:55:58.050531", + "step": 866, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:58.248313", + "step": 866, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31248700618743896, + "timestamp": "2025-09-05 08:55:58.250155", + "step": 867, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:55:58.449096", + "step": 867, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3782794773578644, + "timestamp": "2025-09-05 08:55:58.463518", + "step": 868, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:55:58.655068", + "step": 868, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.46563515067100525, + "timestamp": "2025-09-05 08:55:58.657655", + "step": 869, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:58.853817", + "step": 869, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30388137698173523, + "timestamp": "2025-09-05 08:55:58.855603", + "step": 870, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:59.065956", + "step": 870, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.27287933230400085, + "timestamp": "2025-09-05 08:55:59.067801", + "step": 871, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:55:59.267381", + "step": 871, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.23755168914794922, + "timestamp": "2025-09-05 08:55:59.284414", + "step": 872, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:59.482355", + "step": 872, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37823018431663513, + "timestamp": "2025-09-05 08:55:59.484362", + "step": 873, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:55:59.690986", + "step": 873, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4183447062969208, + "timestamp": "2025-09-05 08:55:59.694146", + "step": 874, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:55:59.902530", + "step": 874, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36634987592697144, + "timestamp": "2025-09-05 08:55:59.904278", + "step": 875, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:00.112019", + "step": 875, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5448222756385803, + "timestamp": "2025-09-05 08:56:00.126417", + "step": 876, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:00.323866", + "step": 876, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38981127738952637, + "timestamp": "2025-09-05 08:56:00.325658", + "step": 877, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:56:00.533301", + "step": 877, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3060409724712372, + "timestamp": "2025-09-05 08:56:00.535138", + "step": 878, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:00.732164", + "step": 878, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.23060426115989685, + "timestamp": "2025-09-05 08:56:00.733955", + "step": 879, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:00.930960", + "step": 879, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3777828514575958, + "timestamp": "2025-09-05 08:56:00.945178", + "step": 880, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:56:05.651096", + "step": 880, + "epoch": 1 + }, + { + "type": "pplx", + "content": 58.9930927717275, + "timestamp": "2025-09-05 08:56:05.653602", + "step": 880, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 880", + "timestamp": "2025-09-05 08:56:06.119224", + "step": 880, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:56:06.284103", + "step": 880, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.35168516635894775, + "timestamp": "2025-09-05 08:56:06.286070", + "step": 881, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:56:06.485122", + "step": 881, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4242359697818756, + "timestamp": "2025-09-05 08:56:06.487493", + "step": 882, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:06.658498", + "step": 882, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3520510792732239, + "timestamp": "2025-09-05 08:56:06.660398", + "step": 883, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:06.866185", + "step": 883, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2529168725013733, + "timestamp": "2025-09-05 08:56:06.880043", + "step": 884, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:56:07.068068", + "step": 884, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4118684232234955, + "timestamp": "2025-09-05 08:56:07.071747", + "step": 885, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:07.283773", + "step": 885, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2805653512477875, + "timestamp": "2025-09-05 08:56:07.290307", + "step": 886, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:07.498225", + "step": 886, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3658747971057892, + "timestamp": "2025-09-05 08:56:07.503757", + "step": 887, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:56:07.723427", + "step": 887, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5264832973480225, + "timestamp": "2025-09-05 08:56:07.738249", + "step": 888, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:56:07.931407", + "step": 888, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4625481069087982, + "timestamp": "2025-09-05 08:56:07.934631", + "step": 889, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:08.137874", + "step": 889, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2717623710632324, + "timestamp": "2025-09-05 08:56:08.141167", + "step": 890, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:08.355120", + "step": 890, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.26788651943206787, + "timestamp": "2025-09-05 08:56:08.357651", + "step": 891, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:08.556363", + "step": 891, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.43440476059913635, + "timestamp": "2025-09-05 08:56:08.572805", + "step": 892, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:08.763294", + "step": 892, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.46988850831985474, + "timestamp": "2025-09-05 08:56:08.765546", + "step": 893, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:56:08.975242", + "step": 893, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.35338684916496277, + "timestamp": "2025-09-05 08:56:08.977587", + "step": 894, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:56:09.196975", + "step": 894, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3955008089542389, + "timestamp": "2025-09-05 08:56:09.205880", + "step": 895, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:09.416322", + "step": 895, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4844711422920227, + "timestamp": "2025-09-05 08:56:09.435475", + "step": 896, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:56:09.636116", + "step": 896, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38304269313812256, + "timestamp": "2025-09-05 08:56:09.640059", + "step": 897, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:09.844074", + "step": 897, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4427229166030884, + "timestamp": "2025-09-05 08:56:09.846695", + "step": 898, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:10.052111", + "step": 898, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31587520241737366, + "timestamp": "2025-09-05 08:56:10.054379", + "step": 899, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:56:10.262257", + "step": 899, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2594102919101715, + "timestamp": "2025-09-05 08:56:10.276857", + "step": 900, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:56:14.955511", + "step": 900, + "epoch": 1 + }, + { + "type": "pplx", + "content": 58.8565162716663, + "timestamp": "2025-09-05 08:56:14.957342", + "step": 900, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:56:15.119484", + "step": 900, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3601221740245819, + "timestamp": "2025-09-05 08:56:15.121057", + "step": 901, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:56:15.326247", + "step": 901, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.296526163816452, + "timestamp": "2025-09-05 08:56:15.327954", + "step": 902, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:15.534820", + "step": 902, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2748814821243286, + "timestamp": "2025-09-05 08:56:15.537456", + "step": 903, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:56:15.735119", + "step": 903, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31274929642677307, + "timestamp": "2025-09-05 08:56:15.752496", + "step": 904, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:15.949953", + "step": 904, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.45903486013412476, + "timestamp": "2025-09-05 08:56:15.952321", + "step": 905, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:56:16.160410", + "step": 905, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.32286933064460754, + "timestamp": "2025-09-05 08:56:16.162171", + "step": 906, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:16.359899", + "step": 906, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3217444121837616, + "timestamp": "2025-09-05 08:56:16.361873", + "step": 907, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:16.556943", + "step": 907, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2954324185848236, + "timestamp": "2025-09-05 08:56:16.571357", + "step": 908, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:56:16.759132", + "step": 908, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3679000735282898, + "timestamp": "2025-09-05 08:56:16.761079", + "step": 909, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:16.959729", + "step": 909, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.41062143445014954, + "timestamp": "2025-09-05 08:56:16.961531", + "step": 910, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:17.157829", + "step": 910, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36205968260765076, + "timestamp": "2025-09-05 08:56:17.160372", + "step": 911, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:56:17.356759", + "step": 911, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3409394323825836, + "timestamp": "2025-09-05 08:56:17.365973", + "step": 912, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:17.529633", + "step": 912, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31686097383499146, + "timestamp": "2025-09-05 08:56:17.531168", + "step": 913, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:17.697289", + "step": 913, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.22255216538906097, + "timestamp": "2025-09-05 08:56:17.699236", + "step": 914, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:17.905578", + "step": 914, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30591610074043274, + "timestamp": "2025-09-05 08:56:17.907210", + "step": 915, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:18.103761", + "step": 915, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36227649450302124, + "timestamp": "2025-09-05 08:56:18.118540", + "step": 916, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:18.307208", + "step": 916, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.29489269852638245, + "timestamp": "2025-09-05 08:56:18.308973", + "step": 917, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:56:18.506067", + "step": 917, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3366081118583679, + "timestamp": "2025-09-05 08:56:18.507597", + "step": 918, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:18.716065", + "step": 918, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2996135652065277, + "timestamp": "2025-09-05 08:56:18.717925", + "step": 919, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:18.916645", + "step": 919, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31638243794441223, + "timestamp": "2025-09-05 08:56:18.930921", + "step": 920, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:56:23.565560", + "step": 920, + "epoch": 1 + }, + { + "type": "pplx", + "content": 58.30528894827069, + "timestamp": "2025-09-05 08:56:23.567482", + "step": 920, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 920", + "timestamp": "2025-09-05 08:56:24.094332", + "step": 920, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:56:24.266169", + "step": 920, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31981584429740906, + "timestamp": "2025-09-05 08:56:24.268944", + "step": 921, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:24.440635", + "step": 921, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4614734649658203, + "timestamp": "2025-09-05 08:56:24.442107", + "step": 922, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:24.647745", + "step": 922, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3498396575450897, + "timestamp": "2025-09-05 08:56:24.649674", + "step": 923, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:24.855227", + "step": 923, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2465495467185974, + "timestamp": "2025-09-05 08:56:24.864527", + "step": 924, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:25.029444", + "step": 924, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5145090818405151, + "timestamp": "2025-09-05 08:56:25.031415", + "step": 925, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:25.238088", + "step": 925, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3693583607673645, + "timestamp": "2025-09-05 08:56:25.239819", + "step": 926, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:25.437346", + "step": 926, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33343541622161865, + "timestamp": "2025-09-05 08:56:25.439379", + "step": 927, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:25.644585", + "step": 927, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.28294891119003296, + "timestamp": "2025-09-05 08:56:25.659020", + "step": 928, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:25.847035", + "step": 928, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3684309720993042, + "timestamp": "2025-09-05 08:56:25.848699", + "step": 929, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:26.044378", + "step": 929, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3786807060241699, + "timestamp": "2025-09-05 08:56:26.046519", + "step": 930, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:26.252989", + "step": 930, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2511294186115265, + "timestamp": "2025-09-05 08:56:26.254868", + "step": 931, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:26.451301", + "step": 931, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.26673150062561035, + "timestamp": "2025-09-05 08:56:26.469187", + "step": 932, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:26.664664", + "step": 932, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2586221396923065, + "timestamp": "2025-09-05 08:56:26.667023", + "step": 933, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:26.861586", + "step": 933, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.23219725489616394, + "timestamp": "2025-09-05 08:56:26.864119", + "step": 934, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:56:27.060475", + "step": 934, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3612004816532135, + "timestamp": "2025-09-05 08:56:27.062849", + "step": 935, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:27.258572", + "step": 935, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.19465096294879913, + "timestamp": "2025-09-05 08:56:27.274029", + "step": 936, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:27.461704", + "step": 936, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2936038374900818, + "timestamp": "2025-09-05 08:56:27.464576", + "step": 937, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:27.661307", + "step": 937, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34567755460739136, + "timestamp": "2025-09-05 08:56:27.663756", + "step": 938, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:27.869389", + "step": 938, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.45169833302497864, + "timestamp": "2025-09-05 08:56:27.873145", + "step": 939, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:28.068724", + "step": 939, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.41843459010124207, + "timestamp": "2025-09-05 08:56:28.083793", + "step": 940, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:56:32.727257", + "step": 940, + "epoch": 1 + }, + { + "type": "pplx", + "content": 57.96766332265921, + "timestamp": "2025-09-05 08:56:32.729095", + "step": 940, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:56:32.889362", + "step": 940, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33605149388313293, + "timestamp": "2025-09-05 08:56:32.891081", + "step": 941, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:33.057909", + "step": 941, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.28017252683639526, + "timestamp": "2025-09-05 08:56:33.059718", + "step": 942, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:33.266409", + "step": 942, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.32622429728507996, + "timestamp": "2025-09-05 08:56:33.268164", + "step": 943, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:33.435862", + "step": 943, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3794481158256531, + "timestamp": "2025-09-05 08:56:33.452555", + "step": 944, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:33.650667", + "step": 944, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.32684630155563354, + "timestamp": "2025-09-05 08:56:33.652374", + "step": 945, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:56:33.848749", + "step": 945, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34910067915916443, + "timestamp": "2025-09-05 08:56:33.850516", + "step": 946, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:56:34.052773", + "step": 946, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39786896109580994, + "timestamp": "2025-09-05 08:56:34.054439", + "step": 947, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:34.261687", + "step": 947, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.410081684589386, + "timestamp": "2025-09-05 08:56:34.276444", + "step": 948, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:56:34.466684", + "step": 948, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5023239850997925, + "timestamp": "2025-09-05 08:56:34.468684", + "step": 949, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:34.664812", + "step": 949, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.29403749108314514, + "timestamp": "2025-09-05 08:56:34.667965", + "step": 950, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:34.863532", + "step": 950, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2468639761209488, + "timestamp": "2025-09-05 08:56:34.865338", + "step": 951, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:35.063835", + "step": 951, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.41007065773010254, + "timestamp": "2025-09-05 08:56:35.077372", + "step": 952, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:35.272719", + "step": 952, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.26287195086479187, + "timestamp": "2025-09-05 08:56:35.275516", + "step": 953, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:35.442519", + "step": 953, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4006151854991913, + "timestamp": "2025-09-05 08:56:35.444495", + "step": 954, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:56:35.650340", + "step": 954, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.25933578610420227, + "timestamp": "2025-09-05 08:56:35.653205", + "step": 955, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:35.856923", + "step": 955, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2387603372335434, + "timestamp": "2025-09-05 08:56:35.873711", + "step": 956, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:36.072514", + "step": 956, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3455435633659363, + "timestamp": "2025-09-05 08:56:36.074229", + "step": 957, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:56:36.270676", + "step": 957, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.42304477095603943, + "timestamp": "2025-09-05 08:56:36.272861", + "step": 958, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:36.470360", + "step": 958, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5853138566017151, + "timestamp": "2025-09-05 08:56:36.472351", + "step": 959, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:56:36.678570", + "step": 959, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4282821714878082, + "timestamp": "2025-09-05 08:56:36.696107", + "step": 960, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:56:41.365028", + "step": 960, + "epoch": 1 + }, + { + "type": "pplx", + "content": 58.0195968066891, + "timestamp": "2025-09-05 08:56:41.367535", + "step": 960, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 960", + "timestamp": "2025-09-05 08:56:41.827347", + "step": 960, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:41.991345", + "step": 960, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38176435232162476, + "timestamp": "2025-09-05 08:56:41.992895", + "step": 961, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:42.188364", + "step": 961, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3238293528556824, + "timestamp": "2025-09-05 08:56:42.190547", + "step": 962, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:42.387838", + "step": 962, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37185245752334595, + "timestamp": "2025-09-05 08:56:42.389442", + "step": 963, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:42.587007", + "step": 963, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33915528655052185, + "timestamp": "2025-09-05 08:56:42.601403", + "step": 964, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:42.789914", + "step": 964, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4303402900695801, + "timestamp": "2025-09-05 08:56:42.792185", + "step": 965, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:42.989673", + "step": 965, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.41477546095848083, + "timestamp": "2025-09-05 08:56:42.991198", + "step": 966, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:43.201206", + "step": 966, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3982813060283661, + "timestamp": "2025-09-05 08:56:43.202897", + "step": 967, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:56:43.399469", + "step": 967, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2835683822631836, + "timestamp": "2025-09-05 08:56:43.413180", + "step": 968, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:56:43.601190", + "step": 968, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39541733264923096, + "timestamp": "2025-09-05 08:56:43.606800", + "step": 969, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:43.812853", + "step": 969, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37673527002334595, + "timestamp": "2025-09-05 08:56:43.814961", + "step": 970, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:44.012986", + "step": 970, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.43757012486457825, + "timestamp": "2025-09-05 08:56:44.014992", + "step": 971, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:44.219053", + "step": 971, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34792765974998474, + "timestamp": "2025-09-05 08:56:44.236654", + "step": 972, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:44.435682", + "step": 972, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2519514858722687, + "timestamp": "2025-09-05 08:56:44.437511", + "step": 973, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:44.644146", + "step": 973, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37428492307662964, + "timestamp": "2025-09-05 08:56:44.645999", + "step": 974, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:44.842027", + "step": 974, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4176678955554962, + "timestamp": "2025-09-05 08:56:44.843935", + "step": 975, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:56:45.041995", + "step": 975, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.32866379618644714, + "timestamp": "2025-09-05 08:56:45.056419", + "step": 976, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:56:45.253585", + "step": 976, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3537275493144989, + "timestamp": "2025-09-05 08:56:45.255728", + "step": 977, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:45.462343", + "step": 977, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39139440655708313, + "timestamp": "2025-09-05 08:56:45.464065", + "step": 978, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:45.660402", + "step": 978, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.47776591777801514, + "timestamp": "2025-09-05 08:56:45.662496", + "step": 979, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:56:45.870532", + "step": 979, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39145949482917786, + "timestamp": "2025-09-05 08:56:45.884767", + "step": 980, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:56:50.525816", + "step": 980, + "epoch": 1 + }, + { + "type": "pplx", + "content": 57.20121236465665, + "timestamp": "2025-09-05 08:56:50.527616", + "step": 980, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:56:50.691990", + "step": 980, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34527283906936646, + "timestamp": "2025-09-05 08:56:50.694050", + "step": 981, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:50.861530", + "step": 981, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.43041056394577026, + "timestamp": "2025-09-05 08:56:50.863361", + "step": 982, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:51.068897", + "step": 982, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.32550325989723206, + "timestamp": "2025-09-05 08:56:51.070801", + "step": 983, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:51.276949", + "step": 983, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3620685040950775, + "timestamp": "2025-09-05 08:56:51.291433", + "step": 984, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:56:51.479999", + "step": 984, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36788034439086914, + "timestamp": "2025-09-05 08:56:51.482550", + "step": 985, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:56:51.678037", + "step": 985, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.20247577130794525, + "timestamp": "2025-09-05 08:56:51.680666", + "step": 986, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:51.887299", + "step": 986, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.255616694688797, + "timestamp": "2025-09-05 08:56:51.889216", + "step": 987, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:56:52.085171", + "step": 987, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.32949069142341614, + "timestamp": "2025-09-05 08:56:52.099490", + "step": 988, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:56:52.298089", + "step": 988, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2952141761779785, + "timestamp": "2025-09-05 08:56:52.300055", + "step": 989, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:56:52.495645", + "step": 989, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3354770541191101, + "timestamp": "2025-09-05 08:56:52.497604", + "step": 990, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:56:52.696906", + "step": 990, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4000490605831146, + "timestamp": "2025-09-05 08:56:52.699540", + "step": 991, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:52.895826", + "step": 991, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.400611937046051, + "timestamp": "2025-09-05 08:56:52.905288", + "step": 992, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:53.068126", + "step": 992, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.41154736280441284, + "timestamp": "2025-09-05 08:56:53.069916", + "step": 993, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:53.289245", + "step": 993, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3314960300922394, + "timestamp": "2025-09-05 08:56:53.291054", + "step": 994, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:53.499477", + "step": 994, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3384896218776703, + "timestamp": "2025-09-05 08:56:53.501707", + "step": 995, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:56:53.708716", + "step": 995, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.24565917253494263, + "timestamp": "2025-09-05 08:56:53.723148", + "step": 996, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:56:53.914157", + "step": 996, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30330580472946167, + "timestamp": "2025-09-05 08:56:53.915892", + "step": 997, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:56:54.113693", + "step": 997, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.339424192905426, + "timestamp": "2025-09-05 08:56:54.116077", + "step": 998, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:54.321917", + "step": 998, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3887946605682373, + "timestamp": "2025-09-05 08:56:54.323779", + "step": 999, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:56:54.523115", + "step": 999, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3614341616630554, + "timestamp": "2025-09-05 08:56:54.537460", + "step": 1000, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:56:59.200485", + "step": 1000, + "epoch": 1 + }, + { + "type": "pplx", + "content": 56.91533872503707, + "timestamp": "2025-09-05 08:56:59.202606", + "step": 1000, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1000", + "timestamp": "2025-09-05 08:56:59.653697", + "step": 1000, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:56:59.817222", + "step": 1000, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39639365673065186, + "timestamp": "2025-09-05 08:56:59.819075", + "step": 1001, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:56:59.988417", + "step": 1001, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34922510385513306, + "timestamp": "2025-09-05 08:56:59.990486", + "step": 1002, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:00.196565", + "step": 1002, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3727569878101349, + "timestamp": "2025-09-05 08:57:00.198531", + "step": 1003, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:00.366728", + "step": 1003, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4180833697319031, + "timestamp": "2025-09-05 08:57:00.381916", + "step": 1004, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:00.579962", + "step": 1004, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3678615689277649, + "timestamp": "2025-09-05 08:57:00.582147", + "step": 1005, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:00.782046", + "step": 1005, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4152246415615082, + "timestamp": "2025-09-05 08:57:00.783965", + "step": 1006, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:00.983251", + "step": 1006, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.25905001163482666, + "timestamp": "2025-09-05 08:57:00.985125", + "step": 1007, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:01.153313", + "step": 1007, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.32605719566345215, + "timestamp": "2025-09-05 08:57:01.168446", + "step": 1008, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:01.357148", + "step": 1008, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.23667758703231812, + "timestamp": "2025-09-05 08:57:01.359310", + "step": 1009, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:01.556187", + "step": 1009, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.32055971026420593, + "timestamp": "2025-09-05 08:57:01.558142", + "step": 1010, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:57:01.754795", + "step": 1010, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3258388340473175, + "timestamp": "2025-09-05 08:57:01.757186", + "step": 1011, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:01.957019", + "step": 1011, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3455141484737396, + "timestamp": "2025-09-05 08:57:01.971080", + "step": 1012, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:57:02.159089", + "step": 1012, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33423060178756714, + "timestamp": "2025-09-05 08:57:02.161397", + "step": 1013, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:02.360107", + "step": 1013, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2707538902759552, + "timestamp": "2025-09-05 08:57:02.364072", + "step": 1014, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:57:02.568044", + "step": 1014, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3885761499404907, + "timestamp": "2025-09-05 08:57:02.570516", + "step": 1015, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:02.768993", + "step": 1015, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.26470303535461426, + "timestamp": "2025-09-05 08:57:02.783878", + "step": 1016, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:02.975347", + "step": 1016, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3257010877132416, + "timestamp": "2025-09-05 08:57:02.977993", + "step": 1017, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:57:03.175106", + "step": 1017, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34169089794158936, + "timestamp": "2025-09-05 08:57:03.176698", + "step": 1018, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:03.375475", + "step": 1018, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3442467749118805, + "timestamp": "2025-09-05 08:57:03.377243", + "step": 1019, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:03.575558", + "step": 1019, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3116152286529541, + "timestamp": "2025-09-05 08:57:03.592015", + "step": 1020, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:57:08.258421", + "step": 1020, + "epoch": 1 + }, + { + "type": "pplx", + "content": 56.72869426129927, + "timestamp": "2025-09-05 08:57:08.260263", + "step": 1020, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:08.421747", + "step": 1020, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.40635350346565247, + "timestamp": "2025-09-05 08:57:08.424564", + "step": 1021, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:08.591844", + "step": 1021, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5129305720329285, + "timestamp": "2025-09-05 08:57:08.593715", + "step": 1022, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:08.799002", + "step": 1022, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3052510917186737, + "timestamp": "2025-09-05 08:57:08.800799", + "step": 1023, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:08.997545", + "step": 1023, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.29209795594215393, + "timestamp": "2025-09-05 08:57:09.011812", + "step": 1024, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:09.200301", + "step": 1024, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4792327284812927, + "timestamp": "2025-09-05 08:57:09.202863", + "step": 1025, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:09.370832", + "step": 1025, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2132500410079956, + "timestamp": "2025-09-05 08:57:09.373056", + "step": 1026, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:09.569420", + "step": 1026, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4008413553237915, + "timestamp": "2025-09-05 08:57:09.571230", + "step": 1027, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:57:09.775965", + "step": 1027, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.40069666504859924, + "timestamp": "2025-09-05 08:57:09.785979", + "step": 1028, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:09.951269", + "step": 1028, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4267215132713318, + "timestamp": "2025-09-05 08:57:09.953022", + "step": 1029, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:10.158013", + "step": 1029, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38750991225242615, + "timestamp": "2025-09-05 08:57:10.159677", + "step": 1030, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:10.352728", + "step": 1030, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.26181745529174805, + "timestamp": "2025-09-05 08:57:10.354524", + "step": 1031, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:10.560112", + "step": 1031, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.27084633708000183, + "timestamp": "2025-09-05 08:57:10.576594", + "step": 1032, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:57:10.772045", + "step": 1032, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.29004645347595215, + "timestamp": "2025-09-05 08:57:10.774068", + "step": 1033, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:10.940422", + "step": 1033, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4643933176994324, + "timestamp": "2025-09-05 08:57:10.942597", + "step": 1034, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:57:11.150075", + "step": 1034, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.42606353759765625, + "timestamp": "2025-09-05 08:57:11.151790", + "step": 1035, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:11.358130", + "step": 1035, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5070635676383972, + "timestamp": "2025-09-05 08:57:11.375711", + "step": 1036, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:11.572391", + "step": 1036, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3080455958843231, + "timestamp": "2025-09-05 08:57:11.574129", + "step": 1037, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:57:11.778556", + "step": 1037, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.25634804368019104, + "timestamp": "2025-09-05 08:57:11.780617", + "step": 1038, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:57:11.987608", + "step": 1038, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3523538410663605, + "timestamp": "2025-09-05 08:57:11.989435", + "step": 1039, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:12.186193", + "step": 1039, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.353440523147583, + "timestamp": "2025-09-05 08:57:12.195768", + "step": 1040, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:57:16.862221", + "step": 1040, + "epoch": 1 + }, + { + "type": "pplx", + "content": 56.865375266262824, + "timestamp": "2025-09-05 08:57:16.865926", + "step": 1040, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1040", + "timestamp": "2025-09-05 08:57:17.336759", + "step": 1040, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:17.507192", + "step": 1040, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4089427888393402, + "timestamp": "2025-09-05 08:57:17.510045", + "step": 1041, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:17.715791", + "step": 1041, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.35010460019111633, + "timestamp": "2025-09-05 08:57:17.718881", + "step": 1042, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:57:17.914889", + "step": 1042, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2957332134246826, + "timestamp": "2025-09-05 08:57:17.917349", + "step": 1043, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:18.114054", + "step": 1043, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.25546780228614807, + "timestamp": "2025-09-05 08:57:18.128508", + "step": 1044, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:57:18.329998", + "step": 1044, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.29871493577957153, + "timestamp": "2025-09-05 08:57:18.332151", + "step": 1045, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:57:18.501032", + "step": 1045, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3960125744342804, + "timestamp": "2025-09-05 08:57:18.503962", + "step": 1046, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:18.711095", + "step": 1046, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2644537091255188, + "timestamp": "2025-09-05 08:57:18.713113", + "step": 1047, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:18.910983", + "step": 1047, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30242469906806946, + "timestamp": "2025-09-05 08:57:18.920490", + "step": 1048, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:19.084703", + "step": 1048, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.43861666321754456, + "timestamp": "2025-09-05 08:57:19.087028", + "step": 1049, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:19.256016", + "step": 1049, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.26393255591392517, + "timestamp": "2025-09-05 08:57:19.258205", + "step": 1050, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:19.464084", + "step": 1050, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.29509156942367554, + "timestamp": "2025-09-05 08:57:19.466567", + "step": 1051, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:57:19.637608", + "step": 1051, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.26441511511802673, + "timestamp": "2025-09-05 08:57:19.654587", + "step": 1052, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:19.852491", + "step": 1052, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3607224225997925, + "timestamp": "2025-09-05 08:57:19.854411", + "step": 1053, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:57:20.026434", + "step": 1053, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.41068440675735474, + "timestamp": "2025-09-05 08:57:20.028325", + "step": 1054, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:20.238848", + "step": 1054, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3460855484008789, + "timestamp": "2025-09-05 08:57:20.241070", + "step": 1055, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:20.438914", + "step": 1055, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.22883664071559906, + "timestamp": "2025-09-05 08:57:20.455318", + "step": 1056, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:20.652501", + "step": 1056, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3002649247646332, + "timestamp": "2025-09-05 08:57:20.654137", + "step": 1057, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:57:20.859648", + "step": 1057, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3611856698989868, + "timestamp": "2025-09-05 08:57:20.862321", + "step": 1058, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:21.030886", + "step": 1058, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34930041432380676, + "timestamp": "2025-09-05 08:57:21.033038", + "step": 1059, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:21.241166", + "step": 1059, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2977052628993988, + "timestamp": "2025-09-05 08:57:21.255758", + "step": 1060, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:57:25.941683", + "step": 1060, + "epoch": 1 + }, + { + "type": "pplx", + "content": 58.08633734713871, + "timestamp": "2025-09-05 08:57:25.943919", + "step": 1060, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:57:26.106655", + "step": 1060, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3529953360557556, + "timestamp": "2025-09-05 08:57:26.109260", + "step": 1061, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:26.276963", + "step": 1061, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.25426262617111206, + "timestamp": "2025-09-05 08:57:26.278757", + "step": 1062, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:26.484665", + "step": 1062, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39096131920814514, + "timestamp": "2025-09-05 08:57:26.486460", + "step": 1063, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:57:26.682998", + "step": 1063, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39098745584487915, + "timestamp": "2025-09-05 08:57:26.697339", + "step": 1064, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:26.885072", + "step": 1064, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39723387360572815, + "timestamp": "2025-09-05 08:57:26.887006", + "step": 1065, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:27.083172", + "step": 1065, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3086792528629303, + "timestamp": "2025-09-05 08:57:27.085530", + "step": 1066, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:57:27.283705", + "step": 1066, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3772234320640564, + "timestamp": "2025-09-05 08:57:27.285628", + "step": 1067, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:57:27.483230", + "step": 1067, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.44487428665161133, + "timestamp": "2025-09-05 08:57:27.497543", + "step": 1068, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:57:27.686727", + "step": 1068, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2712862193584442, + "timestamp": "2025-09-05 08:57:27.688749", + "step": 1069, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:27.899909", + "step": 1069, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2736116647720337, + "timestamp": "2025-09-05 08:57:27.902023", + "step": 1070, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:57:28.104397", + "step": 1070, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2640036344528198, + "timestamp": "2025-09-05 08:57:28.106341", + "step": 1071, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:57:28.301834", + "step": 1071, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.25708210468292236, + "timestamp": "2025-09-05 08:57:28.316390", + "step": 1072, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:57:28.502823", + "step": 1072, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4192257523536682, + "timestamp": "2025-09-05 08:57:28.504604", + "step": 1073, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:28.702247", + "step": 1073, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5188899040222168, + "timestamp": "2025-09-05 08:57:28.704344", + "step": 1074, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:28.910313", + "step": 1074, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.47707605361938477, + "timestamp": "2025-09-05 08:57:28.912539", + "step": 1075, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:57:29.123376", + "step": 1075, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2549794614315033, + "timestamp": "2025-09-05 08:57:29.140734", + "step": 1076, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:29.340155", + "step": 1076, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.32116273045539856, + "timestamp": "2025-09-05 08:57:29.342027", + "step": 1077, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:57:29.537300", + "step": 1077, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2507237195968628, + "timestamp": "2025-09-05 08:57:29.539351", + "step": 1078, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:57:29.744422", + "step": 1078, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30762243270874023, + "timestamp": "2025-09-05 08:57:29.746343", + "step": 1079, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:29.943339", + "step": 1079, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3563655614852905, + "timestamp": "2025-09-05 08:57:29.960859", + "step": 1080, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:57:34.630014", + "step": 1080, + "epoch": 1 + }, + { + "type": "pplx", + "content": 57.13828478413463, + "timestamp": "2025-09-05 08:57:34.632225", + "step": 1080, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1080", + "timestamp": "2025-09-05 08:57:35.114220", + "step": 1080, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:35.297356", + "step": 1080, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3543899655342102, + "timestamp": "2025-09-05 08:57:35.299618", + "step": 1081, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:35.505794", + "step": 1081, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37011322379112244, + "timestamp": "2025-09-05 08:57:35.507707", + "step": 1082, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:35.705175", + "step": 1082, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.29471492767333984, + "timestamp": "2025-09-05 08:57:35.706975", + "step": 1083, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:35.903105", + "step": 1083, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.40031829476356506, + "timestamp": "2025-09-05 08:57:35.917669", + "step": 1084, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:36.107208", + "step": 1084, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.25579631328582764, + "timestamp": "2025-09-05 08:57:36.109233", + "step": 1085, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:36.314564", + "step": 1085, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30125442147254944, + "timestamp": "2025-09-05 08:57:36.316525", + "step": 1086, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:36.522862", + "step": 1086, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3710455000400543, + "timestamp": "2025-09-05 08:57:36.525296", + "step": 1087, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:36.722872", + "step": 1087, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38871708512306213, + "timestamp": "2025-09-05 08:57:36.736903", + "step": 1088, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:36.925159", + "step": 1088, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4635475277900696, + "timestamp": "2025-09-05 08:57:36.926826", + "step": 1089, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:57:37.121132", + "step": 1089, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3786855936050415, + "timestamp": "2025-09-05 08:57:37.123066", + "step": 1090, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:57:37.320185", + "step": 1090, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.29588383436203003, + "timestamp": "2025-09-05 08:57:37.321973", + "step": 1091, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:37.516900", + "step": 1091, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2814580500125885, + "timestamp": "2025-09-05 08:57:37.531031", + "step": 1092, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:37.719213", + "step": 1092, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.29636815190315247, + "timestamp": "2025-09-05 08:57:37.721048", + "step": 1093, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:57:37.888769", + "step": 1093, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30050742626190186, + "timestamp": "2025-09-05 08:57:37.890910", + "step": 1094, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:38.095014", + "step": 1094, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.21762610971927643, + "timestamp": "2025-09-05 08:57:38.096704", + "step": 1095, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:38.293599", + "step": 1095, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36448773741722107, + "timestamp": "2025-09-05 08:57:38.308108", + "step": 1096, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:57:38.495244", + "step": 1096, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2675822079181671, + "timestamp": "2025-09-05 08:57:38.497086", + "step": 1097, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:57:38.703191", + "step": 1097, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3438616693019867, + "timestamp": "2025-09-05 08:57:38.705356", + "step": 1098, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:57:38.901089", + "step": 1098, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.25544899702072144, + "timestamp": "2025-09-05 08:57:38.903018", + "step": 1099, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:39.109292", + "step": 1099, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.15850801765918732, + "timestamp": "2025-09-05 08:57:39.123875", + "step": 1100, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:57:43.756122", + "step": 1100, + "epoch": 1 + }, + { + "type": "pplx", + "content": 55.68282147255737, + "timestamp": "2025-09-05 08:57:43.758533", + "step": 1100, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:43.921152", + "step": 1100, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4078519344329834, + "timestamp": "2025-09-05 08:57:43.923717", + "step": 1101, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:44.130660", + "step": 1101, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31081441044807434, + "timestamp": "2025-09-05 08:57:44.132606", + "step": 1102, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:44.340156", + "step": 1102, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.14278367161750793, + "timestamp": "2025-09-05 08:57:44.342528", + "step": 1103, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:44.549416", + "step": 1103, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2088335156440735, + "timestamp": "2025-09-05 08:57:44.563823", + "step": 1104, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:44.760932", + "step": 1104, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3414754569530487, + "timestamp": "2025-09-05 08:57:44.762833", + "step": 1105, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:44.960267", + "step": 1105, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.43541449308395386, + "timestamp": "2025-09-05 08:57:44.962499", + "step": 1106, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:45.170538", + "step": 1106, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38098201155662537, + "timestamp": "2025-09-05 08:57:45.173030", + "step": 1107, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 272 + ], + "flops": 5440033091648.0 + }, + "timestamp": "2025-09-05 08:57:45.374443", + "step": 1107, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.49568867683410645, + "timestamp": "2025-09-05 08:57:45.391316", + "step": 1108, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:45.590356", + "step": 1108, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.20845304429531097, + "timestamp": "2025-09-05 08:57:45.592498", + "step": 1109, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:45.789652", + "step": 1109, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34582969546318054, + "timestamp": "2025-09-05 08:57:45.791657", + "step": 1110, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:57:45.995088", + "step": 1110, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30466410517692566, + "timestamp": "2025-09-05 08:57:45.997298", + "step": 1111, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:57:46.192699", + "step": 1111, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3194935917854309, + "timestamp": "2025-09-05 08:57:46.209949", + "step": 1112, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:46.408493", + "step": 1112, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3247276246547699, + "timestamp": "2025-09-05 08:57:46.410457", + "step": 1113, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:46.617098", + "step": 1113, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.26578959822654724, + "timestamp": "2025-09-05 08:57:46.618843", + "step": 1114, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:46.818714", + "step": 1114, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.48078733682632446, + "timestamp": "2025-09-05 08:57:46.820884", + "step": 1115, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:47.016092", + "step": 1115, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3435327410697937, + "timestamp": "2025-09-05 08:57:47.030516", + "step": 1116, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:57:47.216523", + "step": 1116, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3160141408443451, + "timestamp": "2025-09-05 08:57:47.218401", + "step": 1117, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:47.412594", + "step": 1117, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3424045145511627, + "timestamp": "2025-09-05 08:57:47.414440", + "step": 1118, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:47.610879", + "step": 1118, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.29838377237319946, + "timestamp": "2025-09-05 08:57:47.612751", + "step": 1119, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:47.808963", + "step": 1119, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2777602970600128, + "timestamp": "2025-09-05 08:57:47.823432", + "step": 1120, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:57:52.452094", + "step": 1120, + "epoch": 1 + }, + { + "type": "pplx", + "content": 56.402618878722485, + "timestamp": "2025-09-05 08:57:52.454888", + "step": 1120, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1120", + "timestamp": "2025-09-05 08:57:52.924333", + "step": 1120, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:57:53.084960", + "step": 1120, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2436535656452179, + "timestamp": "2025-09-05 08:57:53.086827", + "step": 1121, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:53.289641", + "step": 1121, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3066195845603943, + "timestamp": "2025-09-05 08:57:53.291300", + "step": 1122, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:57:53.499294", + "step": 1122, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.210659921169281, + "timestamp": "2025-09-05 08:57:53.501932", + "step": 1123, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:57:53.701662", + "step": 1123, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.42288801074028015, + "timestamp": "2025-09-05 08:57:53.718355", + "step": 1124, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:53.917047", + "step": 1124, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.27715227007865906, + "timestamp": "2025-09-05 08:57:53.918967", + "step": 1125, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:54.085170", + "step": 1125, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4282056987285614, + "timestamp": "2025-09-05 08:57:54.087062", + "step": 1126, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:57:54.280878", + "step": 1126, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3095110356807709, + "timestamp": "2025-09-05 08:57:54.283528", + "step": 1127, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:54.489385", + "step": 1127, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3560994267463684, + "timestamp": "2025-09-05 08:57:54.504279", + "step": 1128, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:54.706093", + "step": 1128, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5486235618591309, + "timestamp": "2025-09-05 08:57:54.708108", + "step": 1129, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:54.915209", + "step": 1129, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2991634011268616, + "timestamp": "2025-09-05 08:57:54.917880", + "step": 1130, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:57:55.116971", + "step": 1130, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.323395311832428, + "timestamp": "2025-09-05 08:57:55.119750", + "step": 1131, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:57:55.315888", + "step": 1131, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2886749505996704, + "timestamp": "2025-09-05 08:57:55.330392", + "step": 1132, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:55.529682", + "step": 1132, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3267287313938141, + "timestamp": "2025-09-05 08:57:55.531485", + "step": 1133, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:55.728151", + "step": 1133, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38941022753715515, + "timestamp": "2025-09-05 08:57:55.730008", + "step": 1134, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:57:55.924848", + "step": 1134, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3658711016178131, + "timestamp": "2025-09-05 08:57:55.926503", + "step": 1135, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:56.093798", + "step": 1135, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34820908308029175, + "timestamp": "2025-09-05 08:57:56.110544", + "step": 1136, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:56.308140", + "step": 1136, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3882652819156647, + "timestamp": "2025-09-05 08:57:56.310266", + "step": 1137, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:57:56.479788", + "step": 1137, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.17920145392417908, + "timestamp": "2025-09-05 08:57:56.482460", + "step": 1138, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:57:56.689791", + "step": 1138, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.21023604273796082, + "timestamp": "2025-09-05 08:57:56.692566", + "step": 1139, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:57:56.889514", + "step": 1139, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.28918132185935974, + "timestamp": "2025-09-05 08:57:56.904085", + "step": 1140, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:58:01.541928", + "step": 1140, + "epoch": 1 + }, + { + "type": "pplx", + "content": 56.62639246330814, + "timestamp": "2025-09-05 08:58:01.543982", + "step": 1140, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:58:01.704731", + "step": 1140, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2688441276550293, + "timestamp": "2025-09-05 08:58:01.707270", + "step": 1141, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:01.918528", + "step": 1141, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.14376114308834076, + "timestamp": "2025-09-05 08:58:01.923187", + "step": 1142, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:58:02.134322", + "step": 1142, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.375431627035141, + "timestamp": "2025-09-05 08:58:02.136938", + "step": 1143, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:58:02.351335", + "step": 1143, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3484734892845154, + "timestamp": "2025-09-05 08:58:02.371162", + "step": 1144, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:02.572337", + "step": 1144, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3586747646331787, + "timestamp": "2025-09-05 08:58:02.573999", + "step": 1145, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:02.768214", + "step": 1145, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.339052677154541, + "timestamp": "2025-09-05 08:58:02.770566", + "step": 1146, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:02.966732", + "step": 1146, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38937392830848694, + "timestamp": "2025-09-05 08:58:02.969114", + "step": 1147, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:58:03.175468", + "step": 1147, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5443062782287598, + "timestamp": "2025-09-05 08:58:03.191793", + "step": 1148, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:03.390206", + "step": 1148, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2671515643596649, + "timestamp": "2025-09-05 08:58:03.392071", + "step": 1149, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:03.588804", + "step": 1149, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31323206424713135, + "timestamp": "2025-09-05 08:58:03.591276", + "step": 1150, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:03.788661", + "step": 1150, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4581383764743805, + "timestamp": "2025-09-05 08:58:03.790937", + "step": 1151, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:03.996509", + "step": 1151, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3642024099826813, + "timestamp": "2025-09-05 08:58:04.011304", + "step": 1152, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:04.200325", + "step": 1152, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.28851771354675293, + "timestamp": "2025-09-05 08:58:04.202081", + "step": 1153, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:04.398012", + "step": 1153, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.297490656375885, + "timestamp": "2025-09-05 08:58:04.399844", + "step": 1154, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:04.595632", + "step": 1154, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3883640766143799, + "timestamp": "2025-09-05 08:58:04.597484", + "step": 1155, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:04.793403", + "step": 1155, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4353066086769104, + "timestamp": "2025-09-05 08:58:04.809872", + "step": 1156, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:58:05.007788", + "step": 1156, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.41394755244255066, + "timestamp": "2025-09-05 08:58:05.010408", + "step": 1157, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:05.210965", + "step": 1157, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4061930477619171, + "timestamp": "2025-09-05 08:58:05.213045", + "step": 1158, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:05.379194", + "step": 1158, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39679089188575745, + "timestamp": "2025-09-05 08:58:05.382513", + "step": 1159, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:58:05.589595", + "step": 1159, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.26510128378868103, + "timestamp": "2025-09-05 08:58:05.604655", + "step": 1160, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:58:10.260868", + "step": 1160, + "epoch": 1 + }, + { + "type": "pplx", + "content": 55.69607325889674, + "timestamp": "2025-09-05 08:58:10.262863", + "step": 1160, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1160", + "timestamp": "2025-09-05 08:58:10.713992", + "step": 1160, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:10.883875", + "step": 1160, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2949378490447998, + "timestamp": "2025-09-05 08:58:10.885977", + "step": 1161, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:58:11.082355", + "step": 1161, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.481171578168869, + "timestamp": "2025-09-05 08:58:11.083958", + "step": 1162, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:11.280145", + "step": 1162, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3129526972770691, + "timestamp": "2025-09-05 08:58:11.281934", + "step": 1163, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:11.476684", + "step": 1163, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3266424238681793, + "timestamp": "2025-09-05 08:58:11.491572", + "step": 1164, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:11.679597", + "step": 1164, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.23583292961120605, + "timestamp": "2025-09-05 08:58:11.681817", + "step": 1165, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:58:11.882166", + "step": 1165, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5377989411354065, + "timestamp": "2025-09-05 08:58:11.885749", + "step": 1166, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:58:12.056875", + "step": 1166, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3818263113498688, + "timestamp": "2025-09-05 08:58:12.059180", + "step": 1167, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:12.265640", + "step": 1167, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.41938114166259766, + "timestamp": "2025-09-05 08:58:12.277483", + "step": 1168, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:12.447993", + "step": 1168, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.17846006155014038, + "timestamp": "2025-09-05 08:58:12.449643", + "step": 1169, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:12.656563", + "step": 1169, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3130786418914795, + "timestamp": "2025-09-05 08:58:12.658864", + "step": 1170, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:12.856448", + "step": 1170, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.24914269149303436, + "timestamp": "2025-09-05 08:58:12.858087", + "step": 1171, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:58:13.053886", + "step": 1171, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3340952694416046, + "timestamp": "2025-09-05 08:58:13.068458", + "step": 1172, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:13.258558", + "step": 1172, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3117923438549042, + "timestamp": "2025-09-05 08:58:13.260298", + "step": 1173, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:13.428124", + "step": 1173, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36320504546165466, + "timestamp": "2025-09-05 08:58:13.430056", + "step": 1174, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:13.597395", + "step": 1174, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3952508866786957, + "timestamp": "2025-09-05 08:58:13.599309", + "step": 1175, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:13.809227", + "step": 1175, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3737662136554718, + "timestamp": "2025-09-05 08:58:13.823689", + "step": 1176, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:14.012685", + "step": 1176, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4855545461177826, + "timestamp": "2025-09-05 08:58:14.014560", + "step": 1177, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:58:14.210707", + "step": 1177, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.489812970161438, + "timestamp": "2025-09-05 08:58:14.212582", + "step": 1178, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:14.418706", + "step": 1178, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3980942666530609, + "timestamp": "2025-09-05 08:58:14.423470", + "step": 1179, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:14.629308", + "step": 1179, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4134223163127899, + "timestamp": "2025-09-05 08:58:14.645650", + "step": 1180, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:58:19.351980", + "step": 1180, + "epoch": 1 + }, + { + "type": "pplx", + "content": 54.78913997248039, + "timestamp": "2025-09-05 08:58:19.355359", + "step": 1180, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:58:19.524235", + "step": 1180, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3536244332790375, + "timestamp": "2025-09-05 08:58:19.526582", + "step": 1181, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:19.725722", + "step": 1181, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2990207076072693, + "timestamp": "2025-09-05 08:58:19.729850", + "step": 1182, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:58:19.933714", + "step": 1182, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2687399983406067, + "timestamp": "2025-09-05 08:58:19.935632", + "step": 1183, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:58:20.139334", + "step": 1183, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3062213957309723, + "timestamp": "2025-09-05 08:58:20.156178", + "step": 1184, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:20.353342", + "step": 1184, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3144480586051941, + "timestamp": "2025-09-05 08:58:20.355192", + "step": 1185, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:58:20.552429", + "step": 1185, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.34665945172309875, + "timestamp": "2025-09-05 08:58:20.554396", + "step": 1186, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:20.762922", + "step": 1186, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2913441061973572, + "timestamp": "2025-09-05 08:58:20.764934", + "step": 1187, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:20.961203", + "step": 1187, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.27978482842445374, + "timestamp": "2025-09-05 08:58:20.977949", + "step": 1188, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:58:21.176178", + "step": 1188, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3447984755039215, + "timestamp": "2025-09-05 08:58:21.178182", + "step": 1189, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:21.385149", + "step": 1189, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.27464714646339417, + "timestamp": "2025-09-05 08:58:21.387222", + "step": 1190, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:21.596698", + "step": 1190, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31913846731185913, + "timestamp": "2025-09-05 08:58:21.600384", + "step": 1191, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:21.804357", + "step": 1191, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3542693257331848, + "timestamp": "2025-09-05 08:58:21.818909", + "step": 1192, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:22.006890", + "step": 1192, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36727648973464966, + "timestamp": "2025-09-05 08:58:22.009547", + "step": 1193, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:22.203981", + "step": 1193, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5776286125183105, + "timestamp": "2025-09-05 08:58:22.206814", + "step": 1194, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:58:22.400605", + "step": 1194, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2967214286327362, + "timestamp": "2025-09-05 08:58:22.403373", + "step": 1195, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:58:22.596778", + "step": 1195, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3207745850086212, + "timestamp": "2025-09-05 08:58:22.612062", + "step": 1196, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:22.807056", + "step": 1196, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.299358606338501, + "timestamp": "2025-09-05 08:58:22.809849", + "step": 1197, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:23.006195", + "step": 1197, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3517630994319916, + "timestamp": "2025-09-05 08:58:23.008969", + "step": 1198, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:58:23.205294", + "step": 1198, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3210662305355072, + "timestamp": "2025-09-05 08:58:23.208167", + "step": 1199, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:23.402800", + "step": 1199, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2987304925918579, + "timestamp": "2025-09-05 08:58:23.420736", + "step": 1200, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:58:28.151648", + "step": 1200, + "epoch": 1 + }, + { + "type": "pplx", + "content": 55.163801136680675, + "timestamp": "2025-09-05 08:58:28.153281", + "step": 1200, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1200", + "timestamp": "2025-09-05 08:58:28.623598", + "step": 1200, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:28.796735", + "step": 1200, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.37314802408218384, + "timestamp": "2025-09-05 08:58:28.800241", + "step": 1201, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:29.007449", + "step": 1201, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.46742531657218933, + "timestamp": "2025-09-05 08:58:29.009503", + "step": 1202, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:29.221368", + "step": 1202, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3984720706939697, + "timestamp": "2025-09-05 08:58:29.223451", + "step": 1203, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:29.435123", + "step": 1203, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.32529622316360474, + "timestamp": "2025-09-05 08:58:29.449351", + "step": 1204, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:29.648833", + "step": 1204, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5382220149040222, + "timestamp": "2025-09-05 08:58:29.650921", + "step": 1205, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:29.849552", + "step": 1205, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38562560081481934, + "timestamp": "2025-09-05 08:58:29.851589", + "step": 1206, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:30.060179", + "step": 1206, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2525237798690796, + "timestamp": "2025-09-05 08:58:30.062452", + "step": 1207, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:30.261557", + "step": 1207, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3691144287586212, + "timestamp": "2025-09-05 08:58:30.279697", + "step": 1208, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:30.480112", + "step": 1208, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.28269970417022705, + "timestamp": "2025-09-05 08:58:30.483548", + "step": 1209, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:58:30.688759", + "step": 1209, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3319048583507538, + "timestamp": "2025-09-05 08:58:30.691126", + "step": 1210, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:30.888760", + "step": 1210, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3252932131290436, + "timestamp": "2025-09-05 08:58:30.890442", + "step": 1211, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:58:31.098575", + "step": 1211, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4231226444244385, + "timestamp": "2025-09-05 08:58:31.115403", + "step": 1212, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:58:31.305184", + "step": 1212, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.28163203597068787, + "timestamp": "2025-09-05 08:58:31.308680", + "step": 1213, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:58:31.515231", + "step": 1213, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3782075345516205, + "timestamp": "2025-09-05 08:58:31.517092", + "step": 1214, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:58:31.714360", + "step": 1214, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30757156014442444, + "timestamp": "2025-09-05 08:58:31.716274", + "step": 1215, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:31.912180", + "step": 1215, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.46240562200546265, + "timestamp": "2025-09-05 08:58:31.926460", + "step": 1216, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:58:32.116240", + "step": 1216, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.275336891412735, + "timestamp": "2025-09-05 08:58:32.118015", + "step": 1217, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:32.314740", + "step": 1217, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.43169447779655457, + "timestamp": "2025-09-05 08:58:32.316516", + "step": 1218, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:32.523346", + "step": 1218, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4832044541835785, + "timestamp": "2025-09-05 08:58:32.525237", + "step": 1219, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:58:32.726761", + "step": 1219, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3741658627986908, + "timestamp": "2025-09-05 08:58:32.743245", + "step": 1220, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:58:37.465340", + "step": 1220, + "epoch": 1 + }, + { + "type": "pplx", + "content": 55.636826007322504, + "timestamp": "2025-09-05 08:58:37.467376", + "step": 1220, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:37.629395", + "step": 1220, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2922039031982422, + "timestamp": "2025-09-05 08:58:37.633868", + "step": 1221, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:37.801769", + "step": 1221, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.31432512402534485, + "timestamp": "2025-09-05 08:58:37.806881", + "step": 1222, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:38.020020", + "step": 1222, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.273971289396286, + "timestamp": "2025-09-05 08:58:38.021964", + "step": 1223, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:38.217738", + "step": 1223, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2825872302055359, + "timestamp": "2025-09-05 08:58:38.234390", + "step": 1224, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:38.430446", + "step": 1224, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3784426152706146, + "timestamp": "2025-09-05 08:58:38.432294", + "step": 1225, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:38.638082", + "step": 1225, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.36815857887268066, + "timestamp": "2025-09-05 08:58:38.639863", + "step": 1226, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:58:38.846971", + "step": 1226, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3271258771419525, + "timestamp": "2025-09-05 08:58:38.848960", + "step": 1227, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 08:58:39.045679", + "step": 1227, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2493131011724472, + "timestamp": "2025-09-05 08:58:39.060206", + "step": 1228, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:39.255473", + "step": 1228, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2988832890987396, + "timestamp": "2025-09-05 08:58:39.257757", + "step": 1229, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:39.472160", + "step": 1229, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.44903451204299927, + "timestamp": "2025-09-05 08:58:39.473964", + "step": 1230, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:39.671824", + "step": 1230, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5419812798500061, + "timestamp": "2025-09-05 08:58:39.674087", + "step": 1231, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:58:39.882692", + "step": 1231, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3951758146286011, + "timestamp": "2025-09-05 08:58:39.897357", + "step": 1232, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:40.086411", + "step": 1232, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.24042554199695587, + "timestamp": "2025-09-05 08:58:40.088327", + "step": 1233, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:58:40.294687", + "step": 1233, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4239327013492584, + "timestamp": "2025-09-05 08:58:40.296994", + "step": 1234, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:40.497105", + "step": 1234, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.19099119305610657, + "timestamp": "2025-09-05 08:58:40.499457", + "step": 1235, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:40.706219", + "step": 1235, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.30486881732940674, + "timestamp": "2025-09-05 08:58:40.722373", + "step": 1236, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:58:40.920116", + "step": 1236, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.24796439707279205, + "timestamp": "2025-09-05 08:58:40.921901", + "step": 1237, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:41.118829", + "step": 1237, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2669127881526947, + "timestamp": "2025-09-05 08:58:41.120812", + "step": 1238, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:41.327218", + "step": 1238, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.20141269266605377, + "timestamp": "2025-09-05 08:58:41.329910", + "step": 1239, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:41.530544", + "step": 1239, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4013859033584595, + "timestamp": "2025-09-05 08:58:41.547398", + "step": 1240, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:58:46.337252", + "step": 1240, + "epoch": 1 + }, + { + "type": "pplx", + "content": 55.81478220379367, + "timestamp": "2025-09-05 08:58:46.339141", + "step": 1240, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1240", + "timestamp": "2025-09-05 08:58:47.010044", + "step": 1240, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:47.179545", + "step": 1240, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3496423661708832, + "timestamp": "2025-09-05 08:58:47.181523", + "step": 1241, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:47.391265", + "step": 1241, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.17773263156414032, + "timestamp": "2025-09-05 08:58:47.393443", + "step": 1242, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:47.591587", + "step": 1242, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3922744393348694, + "timestamp": "2025-09-05 08:58:47.593807", + "step": 1243, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:58:47.792392", + "step": 1243, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.39606019854545593, + "timestamp": "2025-09-05 08:58:47.808993", + "step": 1244, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:58:48.005991", + "step": 1244, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.28507742285728455, + "timestamp": "2025-09-05 08:58:48.007751", + "step": 1245, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:48.208527", + "step": 1245, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38115379214286804, + "timestamp": "2025-09-05 08:58:48.210544", + "step": 1246, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:58:48.420393", + "step": 1246, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38982245326042175, + "timestamp": "2025-09-05 08:58:48.423008", + "step": 1247, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:48.621835", + "step": 1247, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3070641756057739, + "timestamp": "2025-09-05 08:58:48.639605", + "step": 1248, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:58:48.835488", + "step": 1248, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.29290473461151123, + "timestamp": "2025-09-05 08:58:48.837083", + "step": 1249, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:58:49.044009", + "step": 1249, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33328184485435486, + "timestamp": "2025-09-05 08:58:49.045921", + "step": 1250, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:58:49.243517", + "step": 1250, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.482103168964386, + "timestamp": "2025-09-05 08:58:49.245731", + "step": 1251, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:49.451807", + "step": 1251, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4451736807823181, + "timestamp": "2025-09-05 08:58:49.465976", + "step": 1252, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:49.664436", + "step": 1252, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2903027832508087, + "timestamp": "2025-09-05 08:58:49.666700", + "step": 1253, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:49.873073", + "step": 1253, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2942162752151489, + "timestamp": "2025-09-05 08:58:49.875001", + "step": 1254, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:50.072634", + "step": 1254, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4005998969078064, + "timestamp": "2025-09-05 08:58:50.074375", + "step": 1255, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:58:50.273482", + "step": 1255, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3666303753852844, + "timestamp": "2025-09-05 08:58:50.290366", + "step": 1256, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:50.486596", + "step": 1256, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.44452735781669617, + "timestamp": "2025-09-05 08:58:50.488547", + "step": 1257, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:58:50.689502", + "step": 1257, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3432859480381012, + "timestamp": "2025-09-05 08:58:50.691337", + "step": 1258, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:58:50.895348", + "step": 1258, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.44465234875679016, + "timestamp": "2025-09-05 08:58:50.899575", + "step": 1259, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:51.107777", + "step": 1259, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.43338504433631897, + "timestamp": "2025-09-05 08:58:51.124714", + "step": 1260, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:58:55.873912", + "step": 1260, + "epoch": 1 + }, + { + "type": "pplx", + "content": 56.86786214445365, + "timestamp": "2025-09-05 08:58:55.877053", + "step": 1260, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:58:56.038358", + "step": 1260, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2701398730278015, + "timestamp": "2025-09-05 08:58:56.040599", + "step": 1261, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:56.246168", + "step": 1261, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.319100558757782, + "timestamp": "2025-09-05 08:58:56.248207", + "step": 1262, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:56.445175", + "step": 1262, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3955155313014984, + "timestamp": "2025-09-05 08:58:56.447043", + "step": 1263, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:56.653005", + "step": 1263, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.28663066029548645, + "timestamp": "2025-09-05 08:58:56.672510", + "step": 1264, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:56.882587", + "step": 1264, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2910723090171814, + "timestamp": "2025-09-05 08:58:56.884373", + "step": 1265, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:58:57.081041", + "step": 1265, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.48940280079841614, + "timestamp": "2025-09-05 08:58:57.083007", + "step": 1266, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:58:57.278680", + "step": 1266, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.25540128350257874, + "timestamp": "2025-09-05 08:58:57.280828", + "step": 1267, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:57.478468", + "step": 1267, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4109537899494171, + "timestamp": "2025-09-05 08:58:57.492922", + "step": 1268, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:57.682652", + "step": 1268, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3179936408996582, + "timestamp": "2025-09-05 08:58:57.685638", + "step": 1269, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:57.893860", + "step": 1269, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33500999212265015, + "timestamp": "2025-09-05 08:58:57.896864", + "step": 1270, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:58.105312", + "step": 1270, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.316933810710907, + "timestamp": "2025-09-05 08:58:58.107330", + "step": 1271, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:58:58.303861", + "step": 1271, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3351965844631195, + "timestamp": "2025-09-05 08:58:58.326530", + "step": 1272, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:58.515213", + "step": 1272, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.42428067326545715, + "timestamp": "2025-09-05 08:58:58.517405", + "step": 1273, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:58.713550", + "step": 1273, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.24712416529655457, + "timestamp": "2025-09-05 08:58:58.716283", + "step": 1274, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:58:58.916484", + "step": 1274, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.25089097023010254, + "timestamp": "2025-09-05 08:58:58.921477", + "step": 1275, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:58:59.132790", + "step": 1275, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3202970325946808, + "timestamp": "2025-09-05 08:58:59.148507", + "step": 1276, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 240 + ], + "flops": 4800029206464.0 + }, + "timestamp": "2025-09-05 08:58:59.342869", + "step": 1276, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.27855849266052246, + "timestamp": "2025-09-05 08:58:59.344795", + "step": 1277, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:58:59.543276", + "step": 1277, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.40407755970954895, + "timestamp": "2025-09-05 08:58:59.545205", + "step": 1278, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:59.749926", + "step": 1278, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.1615646630525589, + "timestamp": "2025-09-05 08:58:59.759180", + "step": 1279, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:58:59.964874", + "step": 1279, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3803251385688782, + "timestamp": "2025-09-05 08:58:59.981869", + "step": 1280, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:59:04.747412", + "step": 1280, + "epoch": 1 + }, + { + "type": "pplx", + "content": 57.685687212488666, + "timestamp": "2025-09-05 08:59:04.749354", + "step": 1280, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1280", + "timestamp": "2025-09-05 08:59:05.213901", + "step": 1280, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:05.376627", + "step": 1280, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3631496727466583, + "timestamp": "2025-09-05 08:59:05.378976", + "step": 1281, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:59:05.579929", + "step": 1281, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3576178252696991, + "timestamp": "2025-09-05 08:59:05.581880", + "step": 1282, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:05.778238", + "step": 1282, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.33735018968582153, + "timestamp": "2025-09-05 08:59:05.781707", + "step": 1283, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:59:05.980579", + "step": 1283, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4075542986392975, + "timestamp": "2025-09-05 08:59:05.997002", + "step": 1284, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:06.192702", + "step": 1284, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2630583643913269, + "timestamp": "2025-09-05 08:59:06.194210", + "step": 1285, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:59:06.400762", + "step": 1285, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.46530115604400635, + "timestamp": "2025-09-05 08:59:06.402542", + "step": 1286, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:06.608888", + "step": 1286, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3795335292816162, + "timestamp": "2025-09-05 08:59:06.610712", + "step": 1287, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 240 + ], + "flops": 4800029206464.0 + }, + "timestamp": "2025-09-05 08:59:06.828383", + "step": 1287, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.43324583768844604, + "timestamp": "2025-09-05 08:59:06.842845", + "step": 1288, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:07.031687", + "step": 1288, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2171895056962967, + "timestamp": "2025-09-05 08:59:07.033567", + "step": 1289, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:07.229948", + "step": 1289, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.25000834465026855, + "timestamp": "2025-09-05 08:59:07.231777", + "step": 1290, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:07.427444", + "step": 1290, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2618256211280823, + "timestamp": "2025-09-05 08:59:07.429500", + "step": 1291, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:07.635328", + "step": 1291, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.395792156457901, + "timestamp": "2025-09-05 08:59:07.652426", + "step": 1292, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:07.849809", + "step": 1292, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3665156066417694, + "timestamp": "2025-09-05 08:59:07.852069", + "step": 1293, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:08.048771", + "step": 1293, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.38869205117225647, + "timestamp": "2025-09-05 08:59:08.050665", + "step": 1294, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:08.217638", + "step": 1294, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.27352771162986755, + "timestamp": "2025-09-05 08:59:08.222770", + "step": 1295, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:59:08.441962", + "step": 1295, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3475234806537628, + "timestamp": "2025-09-05 08:59:08.456264", + "step": 1296, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:08.643690", + "step": 1296, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.4448797106742859, + "timestamp": "2025-09-05 08:59:08.645368", + "step": 1297, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:59:08.839667", + "step": 1297, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.27712783217430115, + "timestamp": "2025-09-05 08:59:08.841451", + "step": 1298, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:09.046283", + "step": 1298, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2694713771343231, + "timestamp": "2025-09-05 08:59:09.048220", + "step": 1299, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:59:09.242540", + "step": 1299, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.194560706615448, + "timestamp": "2025-09-05 08:59:09.257384", + "step": 1300, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:59:13.911796", + "step": 1300, + "epoch": 1 + }, + { + "type": "pplx", + "content": 57.60386166296047, + "timestamp": "2025-09-05 08:59:13.915390", + "step": 1300, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:14.080096", + "step": 1300, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3144618272781372, + "timestamp": "2025-09-05 08:59:14.082040", + "step": 1301, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:59:14.252081", + "step": 1301, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3946076035499573, + "timestamp": "2025-09-05 08:59:14.254065", + "step": 1302, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:59:14.458505", + "step": 1302, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2931336760520935, + "timestamp": "2025-09-05 08:59:14.460293", + "step": 1303, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:14.667262", + "step": 1303, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4380492568016052, + "timestamp": "2025-09-05 08:59:14.684802", + "step": 1304, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:14.883125", + "step": 1304, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26544374227523804, + "timestamp": "2025-09-05 08:59:14.885058", + "step": 1305, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:15.090269", + "step": 1305, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.363553911447525, + "timestamp": "2025-09-05 08:59:15.091915", + "step": 1306, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:15.287843", + "step": 1306, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3278323709964752, + "timestamp": "2025-09-05 08:59:15.289648", + "step": 1307, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:15.495949", + "step": 1307, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.49476808309555054, + "timestamp": "2025-09-05 08:59:15.505766", + "step": 1308, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:15.669479", + "step": 1308, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4298350214958191, + "timestamp": "2025-09-05 08:59:15.671595", + "step": 1309, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:15.876381", + "step": 1309, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.41758593916893005, + "timestamp": "2025-09-05 08:59:15.878324", + "step": 1310, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:16.074306", + "step": 1310, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.43739357590675354, + "timestamp": "2025-09-05 08:59:16.076299", + "step": 1311, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:16.282501", + "step": 1311, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.44980770349502563, + "timestamp": "2025-09-05 08:59:16.300777", + "step": 1312, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:16.492159", + "step": 1312, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3103547692298889, + "timestamp": "2025-09-05 08:59:16.494221", + "step": 1313, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:16.701249", + "step": 1313, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.42775505781173706, + "timestamp": "2025-09-05 08:59:16.703236", + "step": 1314, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:16.910343", + "step": 1314, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2993074059486389, + "timestamp": "2025-09-05 08:59:16.913368", + "step": 1315, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:59:17.108812", + "step": 1315, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3599098324775696, + "timestamp": "2025-09-05 08:59:17.123094", + "step": 1316, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:17.312042", + "step": 1316, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3977486491203308, + "timestamp": "2025-09-05 08:59:17.313889", + "step": 1317, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:17.510084", + "step": 1317, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3357473611831665, + "timestamp": "2025-09-05 08:59:17.512160", + "step": 1318, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:59:17.709325", + "step": 1318, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.309467077255249, + "timestamp": "2025-09-05 08:59:17.711246", + "step": 1319, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:17.908970", + "step": 1319, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29499152302742004, + "timestamp": "2025-09-05 08:59:17.923049", + "step": 1320, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:59:22.624294", + "step": 1320, + "epoch": 2 + }, + { + "type": "pplx", + "content": 57.36183768004392, + "timestamp": "2025-09-05 08:59:22.626617", + "step": 1320, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1320", + "timestamp": "2025-09-05 08:59:23.087989", + "step": 1320, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:23.269735", + "step": 1320, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.44477683305740356, + "timestamp": "2025-09-05 08:59:23.272494", + "step": 1321, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:59:23.440651", + "step": 1321, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.46572038531303406, + "timestamp": "2025-09-05 08:59:23.443718", + "step": 1322, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:23.650943", + "step": 1322, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27906832098960876, + "timestamp": "2025-09-05 08:59:23.653252", + "step": 1323, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:59:23.865736", + "step": 1323, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4312252104282379, + "timestamp": "2025-09-05 08:59:23.880583", + "step": 1324, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:24.071014", + "step": 1324, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30487996339797974, + "timestamp": "2025-09-05 08:59:24.074126", + "step": 1325, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:24.244295", + "step": 1325, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3292028307914734, + "timestamp": "2025-09-05 08:59:24.247257", + "step": 1326, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:24.452913", + "step": 1326, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3268643915653229, + "timestamp": "2025-09-05 08:59:24.455149", + "step": 1327, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:24.651837", + "step": 1327, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.39009836316108704, + "timestamp": "2025-09-05 08:59:24.666828", + "step": 1328, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:24.856475", + "step": 1328, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33519843220710754, + "timestamp": "2025-09-05 08:59:24.858850", + "step": 1329, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:25.063589", + "step": 1329, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28642693161964417, + "timestamp": "2025-09-05 08:59:25.065551", + "step": 1330, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:59:25.278910", + "step": 1330, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.44241082668304443, + "timestamp": "2025-09-05 08:59:25.280946", + "step": 1331, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:59:25.479903", + "step": 1331, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33670666813850403, + "timestamp": "2025-09-05 08:59:25.494610", + "step": 1332, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:25.683317", + "step": 1332, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2926885485649109, + "timestamp": "2025-09-05 08:59:25.685308", + "step": 1333, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:25.891586", + "step": 1333, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3921295404434204, + "timestamp": "2025-09-05 08:59:25.894116", + "step": 1334, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:26.100092", + "step": 1334, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.37950482964515686, + "timestamp": "2025-09-05 08:59:26.102003", + "step": 1335, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:26.299905", + "step": 1335, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4223680794239044, + "timestamp": "2025-09-05 08:59:26.314409", + "step": 1336, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:26.503230", + "step": 1336, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4278903603553772, + "timestamp": "2025-09-05 08:59:26.505314", + "step": 1337, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:26.712369", + "step": 1337, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3812592923641205, + "timestamp": "2025-09-05 08:59:26.714473", + "step": 1338, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:26.921242", + "step": 1338, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3434469997882843, + "timestamp": "2025-09-05 08:59:26.925060", + "step": 1339, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:59:27.124683", + "step": 1339, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2850334346294403, + "timestamp": "2025-09-05 08:59:27.139548", + "step": 1340, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:59:31.890740", + "step": 1340, + "epoch": 2 + }, + { + "type": "pplx", + "content": 56.91573223377922, + "timestamp": "2025-09-05 08:59:31.893487", + "step": 1340, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:32.055369", + "step": 1340, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3462655246257782, + "timestamp": "2025-09-05 08:59:32.057646", + "step": 1341, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:32.262517", + "step": 1341, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.346880704164505, + "timestamp": "2025-09-05 08:59:32.264638", + "step": 1342, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:32.463118", + "step": 1342, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3966115415096283, + "timestamp": "2025-09-05 08:59:32.465165", + "step": 1343, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:32.671225", + "step": 1343, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33604347705841064, + "timestamp": "2025-09-05 08:59:32.688689", + "step": 1344, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:32.885583", + "step": 1344, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.40588828921318054, + "timestamp": "2025-09-05 08:59:32.887860", + "step": 1345, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:33.085720", + "step": 1345, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24824795126914978, + "timestamp": "2025-09-05 08:59:33.089596", + "step": 1346, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:33.306741", + "step": 1346, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.37973007559776306, + "timestamp": "2025-09-05 08:59:33.309131", + "step": 1347, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:33.517134", + "step": 1347, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.46875327825546265, + "timestamp": "2025-09-05 08:59:33.534015", + "step": 1348, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:59:33.737292", + "step": 1348, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.315308541059494, + "timestamp": "2025-09-05 08:59:33.739605", + "step": 1349, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:59:33.939497", + "step": 1349, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.40450769662857056, + "timestamp": "2025-09-05 08:59:33.941817", + "step": 1350, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:34.148841", + "step": 1350, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33537474274635315, + "timestamp": "2025-09-05 08:59:34.151279", + "step": 1351, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:34.353877", + "step": 1351, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4013102948665619, + "timestamp": "2025-09-05 08:59:34.373704", + "step": 1352, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:34.571837", + "step": 1352, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.37566959857940674, + "timestamp": "2025-09-05 08:59:34.574167", + "step": 1353, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:59:34.772021", + "step": 1353, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2328030914068222, + "timestamp": "2025-09-05 08:59:34.774460", + "step": 1354, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:59:34.982169", + "step": 1354, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.40998750925064087, + "timestamp": "2025-09-05 08:59:34.984549", + "step": 1355, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:35.198027", + "step": 1355, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3128526210784912, + "timestamp": "2025-09-05 08:59:35.214422", + "step": 1356, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:35.405238", + "step": 1356, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26632222533226013, + "timestamp": "2025-09-05 08:59:35.407784", + "step": 1357, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:35.615592", + "step": 1357, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4280310273170471, + "timestamp": "2025-09-05 08:59:35.617878", + "step": 1358, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:59:35.831407", + "step": 1358, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3956908881664276, + "timestamp": "2025-09-05 08:59:35.833787", + "step": 1359, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:36.041582", + "step": 1359, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3153320550918579, + "timestamp": "2025-09-05 08:59:36.056665", + "step": 1360, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:59:40.819925", + "step": 1360, + "epoch": 2 + }, + { + "type": "pplx", + "content": 56.70963319815915, + "timestamp": "2025-09-05 08:59:40.823587", + "step": 1360, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1360", + "timestamp": "2025-09-05 08:59:41.503796", + "step": 1360, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:41.682641", + "step": 1360, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28595757484436035, + "timestamp": "2025-09-05 08:59:41.685944", + "step": 1361, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:41.882882", + "step": 1361, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24493901431560516, + "timestamp": "2025-09-05 08:59:41.885326", + "step": 1362, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:59:42.085395", + "step": 1362, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.5457180142402649, + "timestamp": "2025-09-05 08:59:42.088843", + "step": 1363, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:42.297550", + "step": 1363, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4040033221244812, + "timestamp": "2025-09-05 08:59:42.311680", + "step": 1364, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:59:42.500508", + "step": 1364, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.45766469836235046, + "timestamp": "2025-09-05 08:59:42.504774", + "step": 1365, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:42.715436", + "step": 1365, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3269166648387909, + "timestamp": "2025-09-05 08:59:42.717414", + "step": 1366, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:42.928316", + "step": 1366, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4435346722602844, + "timestamp": "2025-09-05 08:59:42.930765", + "step": 1367, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:43.139410", + "step": 1367, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30959320068359375, + "timestamp": "2025-09-05 08:59:43.153945", + "step": 1368, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:43.344550", + "step": 1368, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3181923031806946, + "timestamp": "2025-09-05 08:59:43.348085", + "step": 1369, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:43.550510", + "step": 1369, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3809313476085663, + "timestamp": "2025-09-05 08:59:43.554965", + "step": 1370, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:59:43.764296", + "step": 1370, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3070775270462036, + "timestamp": "2025-09-05 08:59:43.765849", + "step": 1371, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:43.970865", + "step": 1371, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3441022038459778, + "timestamp": "2025-09-05 08:59:43.987786", + "step": 1372, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:59:44.196406", + "step": 1372, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.48999282717704773, + "timestamp": "2025-09-05 08:59:44.199384", + "step": 1373, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:59:44.400537", + "step": 1373, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3837975561618805, + "timestamp": "2025-09-05 08:59:44.402764", + "step": 1374, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:59:44.602234", + "step": 1374, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29959526658058167, + "timestamp": "2025-09-05 08:59:44.606416", + "step": 1375, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:44.820077", + "step": 1375, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3829365372657776, + "timestamp": "2025-09-05 08:59:44.836407", + "step": 1376, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:45.029202", + "step": 1376, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3905230164527893, + "timestamp": "2025-09-05 08:59:45.031717", + "step": 1377, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:59:45.228189", + "step": 1377, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2604524493217468, + "timestamp": "2025-09-05 08:59:45.230920", + "step": 1378, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:45.401870", + "step": 1378, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2648540437221527, + "timestamp": "2025-09-05 08:59:45.403891", + "step": 1379, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:59:45.610398", + "step": 1379, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25715941190719604, + "timestamp": "2025-09-05 08:59:45.625180", + "step": 1380, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 08:59:50.612485", + "step": 1380, + "epoch": 2 + }, + { + "type": "pplx", + "content": 56.62276512044233, + "timestamp": "2025-09-05 08:59:50.616571", + "step": 1380, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:50.791068", + "step": 1380, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.39979779720306396, + "timestamp": "2025-09-05 08:59:50.792966", + "step": 1381, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:50.997964", + "step": 1381, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3293604254722595, + "timestamp": "2025-09-05 08:59:51.001515", + "step": 1382, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 08:59:51.214343", + "step": 1382, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4426141679286957, + "timestamp": "2025-09-05 08:59:51.216786", + "step": 1383, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:51.477127", + "step": 1383, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4094974994659424, + "timestamp": "2025-09-05 08:59:51.492213", + "step": 1384, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:51.692197", + "step": 1384, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3006475865840912, + "timestamp": "2025-09-05 08:59:51.694616", + "step": 1385, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:51.893460", + "step": 1385, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3527871370315552, + "timestamp": "2025-09-05 08:59:51.895681", + "step": 1386, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:52.108314", + "step": 1386, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4060435891151428, + "timestamp": "2025-09-05 08:59:52.113186", + "step": 1387, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:59:52.315169", + "step": 1387, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31812533736228943, + "timestamp": "2025-09-05 08:59:52.329965", + "step": 1388, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 08:59:52.523118", + "step": 1388, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4235638976097107, + "timestamp": "2025-09-05 08:59:52.525559", + "step": 1389, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:52.765252", + "step": 1389, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3670898377895355, + "timestamp": "2025-09-05 08:59:52.768592", + "step": 1390, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:52.967645", + "step": 1390, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3166564404964447, + "timestamp": "2025-09-05 08:59:52.969921", + "step": 1391, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:53.171302", + "step": 1391, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3685086965560913, + "timestamp": "2025-09-05 08:59:53.189842", + "step": 1392, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:53.392408", + "step": 1392, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24257375299930573, + "timestamp": "2025-09-05 08:59:53.394129", + "step": 1393, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:53.602965", + "step": 1393, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4437284767627716, + "timestamp": "2025-09-05 08:59:53.605553", + "step": 1394, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 08:59:53.885427", + "step": 1394, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.44088515639305115, + "timestamp": "2025-09-05 08:59:53.888345", + "step": 1395, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 08:59:54.104505", + "step": 1395, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.47625863552093506, + "timestamp": "2025-09-05 08:59:54.123048", + "step": 1396, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 08:59:54.327570", + "step": 1396, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.46532103419303894, + "timestamp": "2025-09-05 08:59:54.330291", + "step": 1397, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:54.532148", + "step": 1397, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.32851770520210266, + "timestamp": "2025-09-05 08:59:54.534121", + "step": 1398, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:54.827163", + "step": 1398, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.5156394243240356, + "timestamp": "2025-09-05 08:59:54.829390", + "step": 1399, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 08:59:55.038664", + "step": 1399, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33277690410614014, + "timestamp": "2025-09-05 08:59:55.054002", + "step": 1400, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:00:00.092698", + "step": 1400, + "epoch": 2 + }, + { + "type": "pplx", + "content": 56.81417784437169, + "timestamp": "2025-09-05 09:00:00.095367", + "step": 1400, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1400", + "timestamp": "2025-09-05 09:00:00.596593", + "step": 1400, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:00.791451", + "step": 1400, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25804686546325684, + "timestamp": "2025-09-05 09:00:00.794003", + "step": 1401, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:00:01.002420", + "step": 1401, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31888291239738464, + "timestamp": "2025-09-05 09:00:01.005189", + "step": 1402, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:01.208357", + "step": 1402, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36028096079826355, + "timestamp": "2025-09-05 09:00:01.210255", + "step": 1403, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:00:01.414257", + "step": 1403, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2646711766719818, + "timestamp": "2025-09-05 09:00:01.428985", + "step": 1404, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:01.625056", + "step": 1404, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3558668792247772, + "timestamp": "2025-09-05 09:00:01.627352", + "step": 1405, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:01.824547", + "step": 1405, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3586845099925995, + "timestamp": "2025-09-05 09:00:01.827510", + "step": 1406, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:02.029391", + "step": 1406, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.20658056437969208, + "timestamp": "2025-09-05 09:00:02.032797", + "step": 1407, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:02.243681", + "step": 1407, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3639076352119446, + "timestamp": "2025-09-05 09:00:02.260804", + "step": 1408, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:00:02.462268", + "step": 1408, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26506686210632324, + "timestamp": "2025-09-05 09:00:02.465508", + "step": 1409, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:02.664380", + "step": 1409, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25214824080467224, + "timestamp": "2025-09-05 09:00:02.667383", + "step": 1410, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:02.876346", + "step": 1410, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31711870431900024, + "timestamp": "2025-09-05 09:00:02.878119", + "step": 1411, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:03.088023", + "step": 1411, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.5284944772720337, + "timestamp": "2025-09-05 09:00:03.102500", + "step": 1412, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:03.300795", + "step": 1412, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3170612156391144, + "timestamp": "2025-09-05 09:00:03.302474", + "step": 1413, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:00:03.511578", + "step": 1413, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.19468443095684052, + "timestamp": "2025-09-05 09:00:03.513839", + "step": 1414, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:03.767100", + "step": 1414, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26165586709976196, + "timestamp": "2025-09-05 09:00:03.811286", + "step": 1415, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:04.059572", + "step": 1415, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.32013124227523804, + "timestamp": "2025-09-05 09:00:04.069440", + "step": 1416, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:00:04.236919", + "step": 1416, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36106082797050476, + "timestamp": "2025-09-05 09:00:04.240225", + "step": 1417, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:04.455188", + "step": 1417, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3524361252784729, + "timestamp": "2025-09-05 09:00:04.458212", + "step": 1418, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:00:04.725938", + "step": 1418, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25635817646980286, + "timestamp": "2025-09-05 09:00:04.728692", + "step": 1419, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:04.941135", + "step": 1419, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24682959914207458, + "timestamp": "2025-09-05 09:00:04.957692", + "step": 1420, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:00:10.139186", + "step": 1420, + "epoch": 2 + }, + { + "type": "pplx", + "content": 57.352158708318434, + "timestamp": "2025-09-05 09:00:10.142827", + "step": 1420, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:00:10.304786", + "step": 1420, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.40001311898231506, + "timestamp": "2025-09-05 09:00:10.308705", + "step": 1421, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:00:10.507444", + "step": 1421, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30492401123046875, + "timestamp": "2025-09-05 09:00:10.536910", + "step": 1422, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:00:10.790334", + "step": 1422, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3323606252670288, + "timestamp": "2025-09-05 09:00:10.792575", + "step": 1423, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:00:11.006202", + "step": 1423, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23769192397594452, + "timestamp": "2025-09-05 09:00:11.021212", + "step": 1424, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:00:11.234231", + "step": 1424, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3142143189907074, + "timestamp": "2025-09-05 09:00:11.237890", + "step": 1425, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:11.531231", + "step": 1425, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3281928300857544, + "timestamp": "2025-09-05 09:00:11.532868", + "step": 1426, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:11.742869", + "step": 1426, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4385247528553009, + "timestamp": "2025-09-05 09:00:11.744405", + "step": 1427, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:11.943393", + "step": 1427, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3019809126853943, + "timestamp": "2025-09-05 09:00:12.000833", + "step": 1428, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:00:12.245974", + "step": 1428, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.374027818441391, + "timestamp": "2025-09-05 09:00:12.247971", + "step": 1429, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:12.445020", + "step": 1429, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2926400601863861, + "timestamp": "2025-09-05 09:00:12.446711", + "step": 1430, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:00:12.616677", + "step": 1430, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2943369150161743, + "timestamp": "2025-09-05 09:00:12.618732", + "step": 1431, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:12.828550", + "step": 1431, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25276631116867065, + "timestamp": "2025-09-05 09:00:12.838911", + "step": 1432, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:00:13.004531", + "step": 1432, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.47622016072273254, + "timestamp": "2025-09-05 09:00:13.006782", + "step": 1433, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:13.213452", + "step": 1433, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3429078161716461, + "timestamp": "2025-09-05 09:00:13.215745", + "step": 1434, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:00:13.423244", + "step": 1434, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2541375756263733, + "timestamp": "2025-09-05 09:00:13.425629", + "step": 1435, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:00:13.622643", + "step": 1435, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.42097604274749756, + "timestamp": "2025-09-05 09:00:13.638488", + "step": 1436, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:00:13.834396", + "step": 1436, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29442155361175537, + "timestamp": "2025-09-05 09:00:13.838219", + "step": 1437, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:14.051866", + "step": 1437, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.32426849007606506, + "timestamp": "2025-09-05 09:00:14.054791", + "step": 1438, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:00:14.267023", + "step": 1438, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3755996525287628, + "timestamp": "2025-09-05 09:00:14.269523", + "step": 1439, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:14.476738", + "step": 1439, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2896839678287506, + "timestamp": "2025-09-05 09:00:14.492129", + "step": 1440, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:00:19.555529", + "step": 1440, + "epoch": 2 + }, + { + "type": "pplx", + "content": 57.297274431293886, + "timestamp": "2025-09-05 09:00:19.558449", + "step": 1440, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1440", + "timestamp": "2025-09-05 09:00:20.192961", + "step": 1440, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:20.379456", + "step": 1440, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3088245391845703, + "timestamp": "2025-09-05 09:00:20.381264", + "step": 1441, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:20.582334", + "step": 1441, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.40866950154304504, + "timestamp": "2025-09-05 09:00:20.584803", + "step": 1442, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:00:20.783709", + "step": 1442, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.40102720260620117, + "timestamp": "2025-09-05 09:00:20.787143", + "step": 1443, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:00:20.985647", + "step": 1443, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29152411222457886, + "timestamp": "2025-09-05 09:00:21.001648", + "step": 1444, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:00:21.248024", + "step": 1444, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.35733139514923096, + "timestamp": "2025-09-05 09:00:21.250634", + "step": 1445, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:21.560840", + "step": 1445, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2699089050292969, + "timestamp": "2025-09-05 09:00:21.563859", + "step": 1446, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:00:21.762868", + "step": 1446, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.39894458651542664, + "timestamp": "2025-09-05 09:00:21.765328", + "step": 1447, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:00:21.973633", + "step": 1447, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31270933151245117, + "timestamp": "2025-09-05 09:00:21.988487", + "step": 1448, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:00:22.181427", + "step": 1448, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.468645304441452, + "timestamp": "2025-09-05 09:00:22.183887", + "step": 1449, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:22.382002", + "step": 1449, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2627173066139221, + "timestamp": "2025-09-05 09:00:22.384074", + "step": 1450, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:00:22.590432", + "step": 1450, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.21120664477348328, + "timestamp": "2025-09-05 09:00:22.593672", + "step": 1451, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:00:22.845576", + "step": 1451, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.37008100748062134, + "timestamp": "2025-09-05 09:00:22.860746", + "step": 1452, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:23.057719", + "step": 1452, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2987282872200012, + "timestamp": "2025-09-05 09:00:23.060153", + "step": 1453, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:00:23.328956", + "step": 1453, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4526394307613373, + "timestamp": "2025-09-05 09:00:23.330758", + "step": 1454, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:23.532425", + "step": 1454, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4313774108886719, + "timestamp": "2025-09-05 09:00:23.535905", + "step": 1455, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:23.738330", + "step": 1455, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3727917969226837, + "timestamp": "2025-09-05 09:00:23.754096", + "step": 1456, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:00:23.956198", + "step": 1456, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3371935486793518, + "timestamp": "2025-09-05 09:00:23.958709", + "step": 1457, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:24.156841", + "step": 1457, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29328054189682007, + "timestamp": "2025-09-05 09:00:24.158646", + "step": 1458, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:00:24.387752", + "step": 1458, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4609324336051941, + "timestamp": "2025-09-05 09:00:24.389694", + "step": 1459, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:24.586114", + "step": 1459, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3523360788822174, + "timestamp": "2025-09-05 09:00:24.600107", + "step": 1460, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:00:29.592360", + "step": 1460, + "epoch": 2 + }, + { + "type": "pplx", + "content": 57.27989146436504, + "timestamp": "2025-09-05 09:00:29.594078", + "step": 1460, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:29.757891", + "step": 1460, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.272165983915329, + "timestamp": "2025-09-05 09:00:29.760268", + "step": 1461, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:00:29.965527", + "step": 1461, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4126870334148407, + "timestamp": "2025-09-05 09:00:29.967387", + "step": 1462, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:00:30.219610", + "step": 1462, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3365689218044281, + "timestamp": "2025-09-05 09:00:30.221690", + "step": 1463, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:30.419896", + "step": 1463, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3105847239494324, + "timestamp": "2025-09-05 09:00:30.434605", + "step": 1464, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:00:30.624678", + "step": 1464, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3041870594024658, + "timestamp": "2025-09-05 09:00:30.669491", + "step": 1465, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:30.908522", + "step": 1465, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.21601124107837677, + "timestamp": "2025-09-05 09:00:30.951473", + "step": 1466, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:31.201263", + "step": 1466, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.329571008682251, + "timestamp": "2025-09-05 09:00:31.203271", + "step": 1467, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:31.454663", + "step": 1467, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3269669711589813, + "timestamp": "2025-09-05 09:00:31.469803", + "step": 1468, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:31.664022", + "step": 1468, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.45828115940093994, + "timestamp": "2025-09-05 09:00:31.666165", + "step": 1469, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:00:31.905612", + "step": 1469, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3074597716331482, + "timestamp": "2025-09-05 09:00:31.907252", + "step": 1470, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:32.113377", + "step": 1470, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.45386582612991333, + "timestamp": "2025-09-05 09:00:32.115039", + "step": 1471, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:00:32.323220", + "step": 1471, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31208494305610657, + "timestamp": "2025-09-05 09:00:32.340131", + "step": 1472, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:00:32.539778", + "step": 1472, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30297181010246277, + "timestamp": "2025-09-05 09:00:32.546254", + "step": 1473, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:32.754227", + "step": 1473, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24466463923454285, + "timestamp": "2025-09-05 09:00:32.755965", + "step": 1474, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:33.051143", + "step": 1474, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22954264283180237, + "timestamp": "2025-09-05 09:00:33.052843", + "step": 1475, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:33.249729", + "step": 1475, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2577112317085266, + "timestamp": "2025-09-05 09:00:33.266023", + "step": 1476, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:00:33.454539", + "step": 1476, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4258563220500946, + "timestamp": "2025-09-05 09:00:33.496351", + "step": 1477, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:33.792317", + "step": 1477, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28466764092445374, + "timestamp": "2025-09-05 09:00:33.794271", + "step": 1478, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:00:34.046935", + "step": 1478, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3980422914028168, + "timestamp": "2025-09-05 09:00:34.093014", + "step": 1479, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:00:34.304162", + "step": 1479, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36734020709991455, + "timestamp": "2025-09-05 09:00:34.321087", + "step": 1480, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:00:39.490206", + "step": 1480, + "epoch": 2 + }, + { + "type": "pplx", + "content": 57.14511724528793, + "timestamp": "2025-09-05 09:00:39.492052", + "step": 1480, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1480", + "timestamp": "2025-09-05 09:00:39.944046", + "step": 1480, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:00:40.106662", + "step": 1480, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2696632444858551, + "timestamp": "2025-09-05 09:00:40.108366", + "step": 1481, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:00:40.306117", + "step": 1481, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2808653712272644, + "timestamp": "2025-09-05 09:00:40.308173", + "step": 1482, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:00:40.519058", + "step": 1482, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.43718430399894714, + "timestamp": "2025-09-05 09:00:40.521238", + "step": 1483, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:40.729677", + "step": 1483, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28544357419013977, + "timestamp": "2025-09-05 09:00:40.746202", + "step": 1484, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:00:40.945290", + "step": 1484, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.18251703679561615, + "timestamp": "2025-09-05 09:00:40.947259", + "step": 1485, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:41.155284", + "step": 1485, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38339364528656006, + "timestamp": "2025-09-05 09:00:41.197437", + "step": 1486, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:41.450497", + "step": 1486, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34182044863700867, + "timestamp": "2025-09-05 09:00:41.494088", + "step": 1487, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:41.694560", + "step": 1487, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27841123938560486, + "timestamp": "2025-09-05 09:00:41.711449", + "step": 1488, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:00:41.917243", + "step": 1488, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.342913955450058, + "timestamp": "2025-09-05 09:00:41.920331", + "step": 1489, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:42.129925", + "step": 1489, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31036046147346497, + "timestamp": "2025-09-05 09:00:42.131756", + "step": 1490, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:42.339428", + "step": 1490, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24073675274848938, + "timestamp": "2025-09-05 09:00:42.415206", + "step": 1491, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:42.628445", + "step": 1491, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.43674877285957336, + "timestamp": "2025-09-05 09:00:42.644955", + "step": 1492, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:42.928380", + "step": 1492, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2842003107070923, + "timestamp": "2025-09-05 09:00:42.930866", + "step": 1493, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:43.130467", + "step": 1493, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2694554924964905, + "timestamp": "2025-09-05 09:00:43.132760", + "step": 1494, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:43.344581", + "step": 1494, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4056885540485382, + "timestamp": "2025-09-05 09:00:43.387356", + "step": 1495, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:43.636302", + "step": 1495, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4579038918018341, + "timestamp": "2025-09-05 09:00:43.652646", + "step": 1496, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:00:43.851183", + "step": 1496, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31665748357772827, + "timestamp": "2025-09-05 09:00:43.853144", + "step": 1497, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:44.062967", + "step": 1497, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28368595242500305, + "timestamp": "2025-09-05 09:00:44.065000", + "step": 1498, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:44.266543", + "step": 1498, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36625730991363525, + "timestamp": "2025-09-05 09:00:44.268982", + "step": 1499, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:44.467553", + "step": 1499, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.35754650831222534, + "timestamp": "2025-09-05 09:00:44.484265", + "step": 1500, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:00:49.578608", + "step": 1500, + "epoch": 2 + }, + { + "type": "pplx", + "content": 57.19569553462994, + "timestamp": "2025-09-05 09:00:49.580840", + "step": 1500, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:49.743136", + "step": 1500, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2610255479812622, + "timestamp": "2025-09-05 09:00:49.745210", + "step": 1501, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:49.911404", + "step": 1501, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3230501115322113, + "timestamp": "2025-09-05 09:00:49.913745", + "step": 1502, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:00:50.122490", + "step": 1502, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.42344826459884644, + "timestamp": "2025-09-05 09:00:50.124894", + "step": 1503, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:50.448049", + "step": 1503, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27271905541419983, + "timestamp": "2025-09-05 09:00:50.504586", + "step": 1504, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:00:50.761945", + "step": 1504, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38530251383781433, + "timestamp": "2025-09-05 09:00:50.763482", + "step": 1505, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:50.933199", + "step": 1505, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.19776545464992523, + "timestamp": "2025-09-05 09:00:50.934817", + "step": 1506, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:51.142295", + "step": 1506, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4128006398677826, + "timestamp": "2025-09-05 09:00:51.144305", + "step": 1507, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:51.342140", + "step": 1507, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2871294915676117, + "timestamp": "2025-09-05 09:00:51.358189", + "step": 1508, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:51.547224", + "step": 1508, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24478839337825775, + "timestamp": "2025-09-05 09:00:51.548878", + "step": 1509, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:51.715852", + "step": 1509, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2897082269191742, + "timestamp": "2025-09-05 09:00:51.717461", + "step": 1510, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:51.887071", + "step": 1510, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2650999426841736, + "timestamp": "2025-09-05 09:00:51.888963", + "step": 1511, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:00:52.083755", + "step": 1511, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33520272374153137, + "timestamp": "2025-09-05 09:00:52.100558", + "step": 1512, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:52.297285", + "step": 1512, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2601361572742462, + "timestamp": "2025-09-05 09:00:52.298983", + "step": 1513, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:00:52.494605", + "step": 1513, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3293619453907013, + "timestamp": "2025-09-05 09:00:52.496222", + "step": 1514, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:00:52.703711", + "step": 1514, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3966652750968933, + "timestamp": "2025-09-05 09:00:52.706032", + "step": 1515, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:52.902753", + "step": 1515, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.46486595273017883, + "timestamp": "2025-09-05 09:00:52.912613", + "step": 1516, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:00:53.075232", + "step": 1516, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2847346067428589, + "timestamp": "2025-09-05 09:00:53.077884", + "step": 1517, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:53.247343", + "step": 1517, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3045147657394409, + "timestamp": "2025-09-05 09:00:53.264732", + "step": 1518, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:53.472369", + "step": 1518, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34377965331077576, + "timestamp": "2025-09-05 09:00:53.474258", + "step": 1519, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:53.681046", + "step": 1519, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.353103905916214, + "timestamp": "2025-09-05 09:00:53.695217", + "step": 1520, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:00:58.747097", + "step": 1520, + "epoch": 2 + }, + { + "type": "pplx", + "content": 57.199012932244166, + "timestamp": "2025-09-05 09:00:58.749409", + "step": 1520, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1520", + "timestamp": "2025-09-05 09:00:59.347636", + "step": 1520, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:00:59.530819", + "step": 1520, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2339445799589157, + "timestamp": "2025-09-05 09:00:59.532778", + "step": 1521, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:59.789659", + "step": 1521, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.41792815923690796, + "timestamp": "2025-09-05 09:00:59.791267", + "step": 1522, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:00:59.997972", + "step": 1522, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26164618134498596, + "timestamp": "2025-09-05 09:01:00.001215", + "step": 1523, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:00.201106", + "step": 1523, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3471045196056366, + "timestamp": "2025-09-05 09:01:00.215901", + "step": 1524, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:00.461448", + "step": 1524, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38675960898399353, + "timestamp": "2025-09-05 09:01:00.464367", + "step": 1525, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:00.672850", + "step": 1525, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36886975169181824, + "timestamp": "2025-09-05 09:01:00.674827", + "step": 1526, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:00.875762", + "step": 1526, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2916344106197357, + "timestamp": "2025-09-05 09:01:00.877994", + "step": 1527, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:01.186868", + "step": 1527, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2596033215522766, + "timestamp": "2025-09-05 09:01:01.245041", + "step": 1528, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:01:01.553114", + "step": 1528, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.41993069648742676, + "timestamp": "2025-09-05 09:01:01.554722", + "step": 1529, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:01.762589", + "step": 1529, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3928647041320801, + "timestamp": "2025-09-05 09:01:01.764333", + "step": 1530, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:01.963776", + "step": 1530, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24546408653259277, + "timestamp": "2025-09-05 09:01:01.966150", + "step": 1531, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:02.164161", + "step": 1531, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3285897970199585, + "timestamp": "2025-09-05 09:01:02.180529", + "step": 1532, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:02.380667", + "step": 1532, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23597146570682526, + "timestamp": "2025-09-05 09:01:02.382749", + "step": 1533, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:02.592689", + "step": 1533, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2703210413455963, + "timestamp": "2025-09-05 09:01:02.594777", + "step": 1534, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:02.845322", + "step": 1534, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3228624165058136, + "timestamp": "2025-09-05 09:01:02.847462", + "step": 1535, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:03.047096", + "step": 1535, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3633241355419159, + "timestamp": "2025-09-05 09:01:03.061335", + "step": 1536, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:01:03.258802", + "step": 1536, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3116025924682617, + "timestamp": "2025-09-05 09:01:03.261151", + "step": 1537, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:01:03.505735", + "step": 1537, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2656325101852417, + "timestamp": "2025-09-05 09:01:03.507750", + "step": 1538, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:03.714858", + "step": 1538, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.502154529094696, + "timestamp": "2025-09-05 09:01:03.757830", + "step": 1539, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:04.001581", + "step": 1539, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25501522421836853, + "timestamp": "2025-09-05 09:01:04.056208", + "step": 1540, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:01:09.059460", + "step": 1540, + "epoch": 2 + }, + { + "type": "pplx", + "content": 56.22339176817989, + "timestamp": "2025-09-05 09:01:09.061549", + "step": 1540, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:09.222637", + "step": 1540, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33964210748672485, + "timestamp": "2025-09-05 09:01:09.224561", + "step": 1541, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:01:09.390371", + "step": 1541, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3216552436351776, + "timestamp": "2025-09-05 09:01:09.392053", + "step": 1542, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:01:09.609437", + "step": 1542, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.46677759289741516, + "timestamp": "2025-09-05 09:01:09.611720", + "step": 1543, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:09.820890", + "step": 1543, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4672287404537201, + "timestamp": "2025-09-05 09:01:09.835069", + "step": 1544, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:10.024285", + "step": 1544, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31345078349113464, + "timestamp": "2025-09-05 09:01:10.026199", + "step": 1545, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:10.235065", + "step": 1545, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25660619139671326, + "timestamp": "2025-09-05 09:01:10.237026", + "step": 1546, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:01:10.441232", + "step": 1546, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2932271957397461, + "timestamp": "2025-09-05 09:01:10.443096", + "step": 1547, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:01:10.609307", + "step": 1547, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28923869132995605, + "timestamp": "2025-09-05 09:01:10.626261", + "step": 1548, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:10.873301", + "step": 1548, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2676112949848175, + "timestamp": "2025-09-05 09:01:10.875539", + "step": 1549, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:11.121598", + "step": 1549, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24210013449192047, + "timestamp": "2025-09-05 09:01:11.123753", + "step": 1550, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:11.320871", + "step": 1550, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31983768939971924, + "timestamp": "2025-09-05 09:01:11.322439", + "step": 1551, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:01:11.529052", + "step": 1551, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2809242904186249, + "timestamp": "2025-09-05 09:01:11.543506", + "step": 1552, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:11.788938", + "step": 1552, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3017968237400055, + "timestamp": "2025-09-05 09:01:11.791257", + "step": 1553, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:12.041437", + "step": 1553, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.32433098554611206, + "timestamp": "2025-09-05 09:01:12.043576", + "step": 1554, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:01:12.248072", + "step": 1554, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.5199801921844482, + "timestamp": "2025-09-05 09:01:12.249996", + "step": 1555, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:01:12.416761", + "step": 1555, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.303520530462265, + "timestamp": "2025-09-05 09:01:12.476023", + "step": 1556, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:01:12.719674", + "step": 1556, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4727158844470978, + "timestamp": "2025-09-05 09:01:12.721369", + "step": 1557, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:12.920588", + "step": 1557, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3274668753147125, + "timestamp": "2025-09-05 09:01:12.922360", + "step": 1558, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:13.120064", + "step": 1558, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.35360151529312134, + "timestamp": "2025-09-05 09:01:13.121761", + "step": 1559, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:01:13.304478", + "step": 1559, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4361201524734497, + "timestamp": "2025-09-05 09:01:13.313794", + "step": 1560, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:01:18.338585", + "step": 1560, + "epoch": 2 + }, + { + "type": "pplx", + "content": 56.41456573621123, + "timestamp": "2025-09-05 09:01:18.350072", + "step": 1560, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1560", + "timestamp": "2025-09-05 09:01:18.864357", + "step": 1560, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:19.052137", + "step": 1560, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3437816798686981, + "timestamp": "2025-09-05 09:01:19.086911", + "step": 1561, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:19.346975", + "step": 1561, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3193230926990509, + "timestamp": "2025-09-05 09:01:19.348708", + "step": 1562, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:19.554605", + "step": 1562, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3251970708370209, + "timestamp": "2025-09-05 09:01:19.560596", + "step": 1563, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:19.760161", + "step": 1563, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3838941156864166, + "timestamp": "2025-09-05 09:01:19.776545", + "step": 1564, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:19.974450", + "step": 1564, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3207242488861084, + "timestamp": "2025-09-05 09:01:19.976164", + "step": 1565, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:20.242298", + "step": 1565, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3163876235485077, + "timestamp": "2025-09-05 09:01:20.244368", + "step": 1566, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:20.440224", + "step": 1566, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2813356816768646, + "timestamp": "2025-09-05 09:01:20.442252", + "step": 1567, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:20.647109", + "step": 1567, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.19117353856563568, + "timestamp": "2025-09-05 09:01:20.661404", + "step": 1568, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:01:20.888763", + "step": 1568, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34964126348495483, + "timestamp": "2025-09-05 09:01:20.890577", + "step": 1569, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:21.138262", + "step": 1569, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3004063367843628, + "timestamp": "2025-09-05 09:01:21.141055", + "step": 1570, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:21.335800", + "step": 1570, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3171744644641876, + "timestamp": "2025-09-05 09:01:21.339587", + "step": 1571, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:21.533662", + "step": 1571, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.47390928864479065, + "timestamp": "2025-09-05 09:01:21.551024", + "step": 1572, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:01:21.746161", + "step": 1572, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34400877356529236, + "timestamp": "2025-09-05 09:01:21.749209", + "step": 1573, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:21.914493", + "step": 1573, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3936767876148224, + "timestamp": "2025-09-05 09:01:21.917943", + "step": 1574, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:22.122702", + "step": 1574, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2832147181034088, + "timestamp": "2025-09-05 09:01:22.125039", + "step": 1575, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:22.291669", + "step": 1575, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2926376760005951, + "timestamp": "2025-09-05 09:01:22.309042", + "step": 1576, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:22.551858", + "step": 1576, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4421359896659851, + "timestamp": "2025-09-05 09:01:22.555003", + "step": 1577, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:22.749743", + "step": 1577, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3380582928657532, + "timestamp": "2025-09-05 09:01:22.752462", + "step": 1578, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:22.949713", + "step": 1578, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23209118843078613, + "timestamp": "2025-09-05 09:01:22.952426", + "step": 1579, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:23.244611", + "step": 1579, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29452741146087646, + "timestamp": "2025-09-05 09:01:23.260250", + "step": 1580, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:01:28.387804", + "step": 1580, + "epoch": 2 + }, + { + "type": "pplx", + "content": 56.31358010394533, + "timestamp": "2025-09-05 09:01:28.390146", + "step": 1580, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:28.552853", + "step": 1580, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22305051982402802, + "timestamp": "2025-09-05 09:01:28.555962", + "step": 1581, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:28.762299", + "step": 1581, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4951125383377075, + "timestamp": "2025-09-05 09:01:28.765755", + "step": 1582, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:28.965837", + "step": 1582, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2661086320877075, + "timestamp": "2025-09-05 09:01:28.967787", + "step": 1583, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:29.150701", + "step": 1583, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3857158124446869, + "timestamp": "2025-09-05 09:01:29.160993", + "step": 1584, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:29.324956", + "step": 1584, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.32872211933135986, + "timestamp": "2025-09-05 09:01:29.326514", + "step": 1585, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:01:29.531376", + "step": 1585, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3093208074569702, + "timestamp": "2025-09-05 09:01:29.533473", + "step": 1586, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:29.733192", + "step": 1586, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29720935225486755, + "timestamp": "2025-09-05 09:01:29.735435", + "step": 1587, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:29.930239", + "step": 1587, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3052479922771454, + "timestamp": "2025-09-05 09:01:29.946354", + "step": 1588, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:01:30.138120", + "step": 1588, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3246830105781555, + "timestamp": "2025-09-05 09:01:30.140093", + "step": 1589, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:30.336315", + "step": 1589, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4002441465854645, + "timestamp": "2025-09-05 09:01:30.338455", + "step": 1590, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:01:30.534261", + "step": 1590, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34371864795684814, + "timestamp": "2025-09-05 09:01:30.577760", + "step": 1591, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:30.775757", + "step": 1591, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30698439478874207, + "timestamp": "2025-09-05 09:01:30.790514", + "step": 1592, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:01:30.980985", + "step": 1592, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4689514636993408, + "timestamp": "2025-09-05 09:01:30.982847", + "step": 1593, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:31.187822", + "step": 1593, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3119965195655823, + "timestamp": "2025-09-05 09:01:31.189428", + "step": 1594, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:31.384273", + "step": 1594, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4260518550872803, + "timestamp": "2025-09-05 09:01:31.386455", + "step": 1595, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:31.584055", + "step": 1595, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.42162322998046875, + "timestamp": "2025-09-05 09:01:31.598523", + "step": 1596, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:31.842004", + "step": 1596, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3950115144252777, + "timestamp": "2025-09-05 09:01:31.844535", + "step": 1597, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:32.042926", + "step": 1597, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3251371681690216, + "timestamp": "2025-09-05 09:01:32.045798", + "step": 1598, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:01:32.242051", + "step": 1598, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4372723698616028, + "timestamp": "2025-09-05 09:01:32.245518", + "step": 1599, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:32.442582", + "step": 1599, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2939607799053192, + "timestamp": "2025-09-05 09:01:32.459198", + "step": 1600, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:01:37.613280", + "step": 1600, + "epoch": 2 + }, + { + "type": "pplx", + "content": 56.03260913326416, + "timestamp": "2025-09-05 09:01:37.615081", + "step": 1600, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1600", + "timestamp": "2025-09-05 09:01:38.059228", + "step": 1600, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:01:38.220521", + "step": 1600, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4079391062259674, + "timestamp": "2025-09-05 09:01:38.222270", + "step": 1601, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:38.426775", + "step": 1601, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26604366302490234, + "timestamp": "2025-09-05 09:01:38.428405", + "step": 1602, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:38.632567", + "step": 1602, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.270683616399765, + "timestamp": "2025-09-05 09:01:38.634094", + "step": 1603, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:01:38.832221", + "step": 1603, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3520897924900055, + "timestamp": "2025-09-05 09:01:38.846486", + "step": 1604, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:39.035039", + "step": 1604, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4065357744693756, + "timestamp": "2025-09-05 09:01:39.037089", + "step": 1605, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:01:39.231198", + "step": 1605, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4396074414253235, + "timestamp": "2025-09-05 09:01:39.233010", + "step": 1606, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:39.432802", + "step": 1606, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.19596950709819794, + "timestamp": "2025-09-05 09:01:39.434547", + "step": 1607, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:39.683964", + "step": 1607, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34500351548194885, + "timestamp": "2025-09-05 09:01:39.698962", + "step": 1608, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:01:39.890897", + "step": 1608, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29165971279144287, + "timestamp": "2025-09-05 09:01:39.892845", + "step": 1609, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:40.091985", + "step": 1609, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.41614753007888794, + "timestamp": "2025-09-05 09:01:40.094774", + "step": 1610, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:40.345641", + "step": 1610, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.35462021827697754, + "timestamp": "2025-09-05 09:01:40.349976", + "step": 1611, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:01:40.611172", + "step": 1611, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.440545916557312, + "timestamp": "2025-09-05 09:01:40.626023", + "step": 1612, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:01:40.824860", + "step": 1612, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30644723773002625, + "timestamp": "2025-09-05 09:01:40.827413", + "step": 1613, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:01:41.078955", + "step": 1613, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31383422017097473, + "timestamp": "2025-09-05 09:01:41.081534", + "step": 1614, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:41.278142", + "step": 1614, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4448195695877075, + "timestamp": "2025-09-05 09:01:41.280472", + "step": 1615, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:41.473811", + "step": 1615, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2656569480895996, + "timestamp": "2025-09-05 09:01:41.488401", + "step": 1616, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:01:41.678286", + "step": 1616, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27651652693748474, + "timestamp": "2025-09-05 09:01:41.680091", + "step": 1617, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:41.929276", + "step": 1617, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22921140491962433, + "timestamp": "2025-09-05 09:01:41.931200", + "step": 1618, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:42.128588", + "step": 1618, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24880188703536987, + "timestamp": "2025-09-05 09:01:42.130379", + "step": 1619, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:42.339134", + "step": 1619, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27319225668907166, + "timestamp": "2025-09-05 09:01:42.395861", + "step": 1620, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:01:47.860884", + "step": 1620, + "epoch": 2 + }, + { + "type": "pplx", + "content": 55.03103150706018, + "timestamp": "2025-09-05 09:01:47.862612", + "step": 1620, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:48.026237", + "step": 1620, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2473212331533432, + "timestamp": "2025-09-05 09:01:48.069969", + "step": 1621, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:48.239318", + "step": 1621, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30699431896209717, + "timestamp": "2025-09-05 09:01:48.241670", + "step": 1622, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:48.447505", + "step": 1622, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.47052350640296936, + "timestamp": "2025-09-05 09:01:48.449557", + "step": 1623, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:48.656637", + "step": 1623, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3628225028514862, + "timestamp": "2025-09-05 09:01:48.714869", + "step": 1624, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:48.914680", + "step": 1624, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26334837079048157, + "timestamp": "2025-09-05 09:01:48.916727", + "step": 1625, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:49.115550", + "step": 1625, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.37516269087791443, + "timestamp": "2025-09-05 09:01:49.117110", + "step": 1626, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 240 + ], + "flops": 4800029206464.0 + }, + "timestamp": "2025-09-05 09:01:49.315505", + "step": 1626, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.5237755179405212, + "timestamp": "2025-09-05 09:01:49.317923", + "step": 1627, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:49.526667", + "step": 1627, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.20416662096977234, + "timestamp": "2025-09-05 09:01:49.540424", + "step": 1628, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:01:49.731734", + "step": 1628, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3799273371696472, + "timestamp": "2025-09-05 09:01:49.733539", + "step": 1629, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:49.928401", + "step": 1629, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4185466170310974, + "timestamp": "2025-09-05 09:01:49.930768", + "step": 1630, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:50.137540", + "step": 1630, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28220972418785095, + "timestamp": "2025-09-05 09:01:50.140239", + "step": 1631, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:50.348233", + "step": 1631, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2010842263698578, + "timestamp": "2025-09-05 09:01:50.363989", + "step": 1632, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:01:50.607652", + "step": 1632, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29958879947662354, + "timestamp": "2025-09-05 09:01:50.610206", + "step": 1633, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:50.808964", + "step": 1633, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33167004585266113, + "timestamp": "2025-09-05 09:01:50.811115", + "step": 1634, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:51.009267", + "step": 1634, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24998310208320618, + "timestamp": "2025-09-05 09:01:51.031265", + "step": 1635, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:51.282600", + "step": 1635, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.311273455619812, + "timestamp": "2025-09-05 09:01:51.297279", + "step": 1636, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:51.488464", + "step": 1636, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28948909044265747, + "timestamp": "2025-09-05 09:01:51.490652", + "step": 1637, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:01:51.746233", + "step": 1637, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2414962649345398, + "timestamp": "2025-09-05 09:01:51.748103", + "step": 1638, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:51.945697", + "step": 1638, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26412102580070496, + "timestamp": "2025-09-05 09:01:51.947451", + "step": 1639, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:01:52.142544", + "step": 1639, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27432331442832947, + "timestamp": "2025-09-05 09:01:52.160428", + "step": 1640, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:01:56.942968", + "step": 1640, + "epoch": 2 + }, + { + "type": "pplx", + "content": 54.70964771417281, + "timestamp": "2025-09-05 09:01:56.947175", + "step": 1640, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1640", + "timestamp": "2025-09-05 09:01:57.598988", + "step": 1640, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:57.841522", + "step": 1640, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.32458359003067017, + "timestamp": "2025-09-05 09:01:57.843776", + "step": 1641, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:01:58.040943", + "step": 1641, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4251089096069336, + "timestamp": "2025-09-05 09:01:58.042669", + "step": 1642, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:58.240063", + "step": 1642, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4108656942844391, + "timestamp": "2025-09-05 09:01:58.241816", + "step": 1643, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:58.437720", + "step": 1643, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38851895928382874, + "timestamp": "2025-09-05 09:01:58.454504", + "step": 1644, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:58.651986", + "step": 1644, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2178976833820343, + "timestamp": "2025-09-05 09:01:58.654067", + "step": 1645, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:01:58.822924", + "step": 1645, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.35721564292907715, + "timestamp": "2025-09-05 09:01:58.825316", + "step": 1646, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:01:59.044925", + "step": 1646, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.17846766114234924, + "timestamp": "2025-09-05 09:01:59.047352", + "step": 1647, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:59.215050", + "step": 1647, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38131412863731384, + "timestamp": "2025-09-05 09:01:59.231949", + "step": 1648, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:01:59.428199", + "step": 1648, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.16115841269493103, + "timestamp": "2025-09-05 09:01:59.430055", + "step": 1649, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:01:59.636473", + "step": 1649, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.45021679997444153, + "timestamp": "2025-09-05 09:01:59.638966", + "step": 1650, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:01:59.848917", + "step": 1650, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3558279871940613, + "timestamp": "2025-09-05 09:01:59.851023", + "step": 1651, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:02:00.096329", + "step": 1651, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.40276896953582764, + "timestamp": "2025-09-05 09:02:00.113286", + "step": 1652, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:00.312785", + "step": 1652, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29273685812950134, + "timestamp": "2025-09-05 09:02:00.314731", + "step": 1653, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:00.563338", + "step": 1653, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4775174558162689, + "timestamp": "2025-09-05 09:02:00.565910", + "step": 1654, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:02:00.763870", + "step": 1654, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3428560793399811, + "timestamp": "2025-09-05 09:02:00.767063", + "step": 1655, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:02:01.016867", + "step": 1655, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3681947588920593, + "timestamp": "2025-09-05 09:02:01.031643", + "step": 1656, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:02:01.218711", + "step": 1656, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4562534689903259, + "timestamp": "2025-09-05 09:02:01.220599", + "step": 1657, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:01.606943", + "step": 1657, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38250502943992615, + "timestamp": "2025-09-05 09:02:01.609870", + "step": 1658, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:02:01.806481", + "step": 1658, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4949440658092499, + "timestamp": "2025-09-05 09:02:01.809636", + "step": 1659, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:02.006945", + "step": 1659, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30622151494026184, + "timestamp": "2025-09-05 09:02:02.022446", + "step": 1660, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:02:07.306404", + "step": 1660, + "epoch": 2 + }, + { + "type": "pplx", + "content": 55.28318964602614, + "timestamp": "2025-09-05 09:02:07.308770", + "step": 1660, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:02:07.471588", + "step": 1660, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.41278305649757385, + "timestamp": "2025-09-05 09:02:07.473751", + "step": 1661, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:07.641861", + "step": 1661, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2784328758716583, + "timestamp": "2025-09-05 09:02:07.644612", + "step": 1662, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:07.853696", + "step": 1662, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3032529056072235, + "timestamp": "2025-09-05 09:02:07.855716", + "step": 1663, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:08.056549", + "step": 1663, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3818753659725189, + "timestamp": "2025-09-05 09:02:08.071192", + "step": 1664, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:02:08.260992", + "step": 1664, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.49340173602104187, + "timestamp": "2025-09-05 09:02:08.263931", + "step": 1665, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:02:08.471439", + "step": 1665, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30880993604660034, + "timestamp": "2025-09-05 09:02:08.474238", + "step": 1666, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:02:08.681287", + "step": 1666, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34679117798805237, + "timestamp": "2025-09-05 09:02:08.683188", + "step": 1667, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:08.890210", + "step": 1667, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2942996919155121, + "timestamp": "2025-09-05 09:02:08.905288", + "step": 1668, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:02:09.094441", + "step": 1668, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.43989962339401245, + "timestamp": "2025-09-05 09:02:09.096279", + "step": 1669, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:02:09.302837", + "step": 1669, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30262741446495056, + "timestamp": "2025-09-05 09:02:09.310213", + "step": 1670, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:02:09.509474", + "step": 1670, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2830575406551361, + "timestamp": "2025-09-05 09:02:09.511875", + "step": 1671, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:09.708618", + "step": 1671, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23973654210567474, + "timestamp": "2025-09-05 09:02:09.723564", + "step": 1672, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:09.912351", + "step": 1672, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4138979911804199, + "timestamp": "2025-09-05 09:02:09.914678", + "step": 1673, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:02:10.120830", + "step": 1673, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33984220027923584, + "timestamp": "2025-09-05 09:02:10.122661", + "step": 1674, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:10.330661", + "step": 1674, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4164768159389496, + "timestamp": "2025-09-05 09:02:10.332757", + "step": 1675, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:10.528555", + "step": 1675, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38378530740737915, + "timestamp": "2025-09-05 09:02:10.545099", + "step": 1676, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:10.752714", + "step": 1676, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4028756320476532, + "timestamp": "2025-09-05 09:02:10.755044", + "step": 1677, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:10.948364", + "step": 1677, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31969425082206726, + "timestamp": "2025-09-05 09:02:10.950693", + "step": 1678, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:11.157995", + "step": 1678, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.41612645983695984, + "timestamp": "2025-09-05 09:02:11.160072", + "step": 1679, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:11.366276", + "step": 1679, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3250633478164673, + "timestamp": "2025-09-05 09:02:11.381192", + "step": 1680, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:02:16.366281", + "step": 1680, + "epoch": 2 + }, + { + "type": "pplx", + "content": 54.385877678644015, + "timestamp": "2025-09-05 09:02:16.371386", + "step": 1680, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1680", + "timestamp": "2025-09-05 09:02:16.822315", + "step": 1680, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:02:17.010071", + "step": 1680, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22635141015052795, + "timestamp": "2025-09-05 09:02:17.013493", + "step": 1681, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:17.221771", + "step": 1681, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3104085326194763, + "timestamp": "2025-09-05 09:02:17.223669", + "step": 1682, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:02:17.422856", + "step": 1682, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3125440776348114, + "timestamp": "2025-09-05 09:02:17.424683", + "step": 1683, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:02:17.590705", + "step": 1683, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28808438777923584, + "timestamp": "2025-09-05 09:02:17.624104", + "step": 1684, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:17.822056", + "step": 1684, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3280973434448242, + "timestamp": "2025-09-05 09:02:17.824459", + "step": 1685, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:18.074431", + "step": 1685, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4642024636268616, + "timestamp": "2025-09-05 09:02:18.076468", + "step": 1686, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:02:18.245941", + "step": 1686, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4451746940612793, + "timestamp": "2025-09-05 09:02:18.248641", + "step": 1687, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:18.445851", + "step": 1687, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4639004170894623, + "timestamp": "2025-09-05 09:02:18.460296", + "step": 1688, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:18.647761", + "step": 1688, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.44137394428253174, + "timestamp": "2025-09-05 09:02:18.649593", + "step": 1689, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:02:18.855698", + "step": 1689, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3657119870185852, + "timestamp": "2025-09-05 09:02:18.857681", + "step": 1690, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:19.054479", + "step": 1690, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38898834586143494, + "timestamp": "2025-09-05 09:02:19.056545", + "step": 1691, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:02:19.252320", + "step": 1691, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3620879352092743, + "timestamp": "2025-09-05 09:02:19.266955", + "step": 1692, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:02:19.456051", + "step": 1692, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3081320822238922, + "timestamp": "2025-09-05 09:02:19.457978", + "step": 1693, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:02:19.625303", + "step": 1693, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4219552278518677, + "timestamp": "2025-09-05 09:02:19.627423", + "step": 1694, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:02:19.832534", + "step": 1694, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31051498651504517, + "timestamp": "2025-09-05 09:02:19.834368", + "step": 1695, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:20.039831", + "step": 1695, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.18346603214740753, + "timestamp": "2025-09-05 09:02:20.054596", + "step": 1696, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:20.251930", + "step": 1696, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2537613809108734, + "timestamp": "2025-09-05 09:02:20.253750", + "step": 1697, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:20.450601", + "step": 1697, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4183553457260132, + "timestamp": "2025-09-05 09:02:20.452396", + "step": 1698, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:02:20.659171", + "step": 1698, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3628275990486145, + "timestamp": "2025-09-05 09:02:20.661000", + "step": 1699, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:02:20.856750", + "step": 1699, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3655959367752075, + "timestamp": "2025-09-05 09:02:20.871615", + "step": 1700, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:02:25.746505", + "step": 1700, + "epoch": 2 + }, + { + "type": "pplx", + "content": 52.8854128909846, + "timestamp": "2025-09-05 09:02:25.748403", + "step": 1700, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:02:25.911309", + "step": 1700, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.5494905710220337, + "timestamp": "2025-09-05 09:02:25.913231", + "step": 1701, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:02:26.118423", + "step": 1701, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24261198937892914, + "timestamp": "2025-09-05 09:02:26.121092", + "step": 1702, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:26.317960", + "step": 1702, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3774292469024658, + "timestamp": "2025-09-05 09:02:26.320170", + "step": 1703, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:02:26.488714", + "step": 1703, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3856517970561981, + "timestamp": "2025-09-05 09:02:26.506248", + "step": 1704, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:02:26.695169", + "step": 1704, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31671613454818726, + "timestamp": "2025-09-05 09:02:26.698931", + "step": 1705, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:02:26.947442", + "step": 1705, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33507075905799866, + "timestamp": "2025-09-05 09:02:26.980111", + "step": 1706, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:02:27.187298", + "step": 1706, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.32331711053848267, + "timestamp": "2025-09-05 09:02:27.189573", + "step": 1707, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:27.355286", + "step": 1707, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34161141514778137, + "timestamp": "2025-09-05 09:02:27.372454", + "step": 1708, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:02:27.568904", + "step": 1708, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29065045714378357, + "timestamp": "2025-09-05 09:02:27.570686", + "step": 1709, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:27.778591", + "step": 1709, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22849217057228088, + "timestamp": "2025-09-05 09:02:27.780580", + "step": 1710, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:27.947810", + "step": 1710, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3984873592853546, + "timestamp": "2025-09-05 09:02:27.949766", + "step": 1711, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:02:28.154332", + "step": 1711, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26712897419929504, + "timestamp": "2025-09-05 09:02:28.168755", + "step": 1712, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:02:28.358064", + "step": 1712, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33036503195762634, + "timestamp": "2025-09-05 09:02:28.359870", + "step": 1713, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:02:28.528281", + "step": 1713, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.40827813744544983, + "timestamp": "2025-09-05 09:02:28.531089", + "step": 1714, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:02:28.785579", + "step": 1714, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30413565039634705, + "timestamp": "2025-09-05 09:02:28.787924", + "step": 1715, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:02:29.125532", + "step": 1715, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33635202050209045, + "timestamp": "2025-09-05 09:02:29.141494", + "step": 1716, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:02:29.384162", + "step": 1716, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30479493737220764, + "timestamp": "2025-09-05 09:02:29.385834", + "step": 1717, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:29.580567", + "step": 1717, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25971081852912903, + "timestamp": "2025-09-05 09:02:29.582717", + "step": 1718, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:02:29.788226", + "step": 1718, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33488330245018005, + "timestamp": "2025-09-05 09:02:29.790269", + "step": 1719, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:30.038248", + "step": 1719, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2982542812824249, + "timestamp": "2025-09-05 09:02:30.053364", + "step": 1720, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:02:36.028498", + "step": 1720, + "epoch": 2 + }, + { + "type": "pplx", + "content": 52.41845639341919, + "timestamp": "2025-09-05 09:02:36.031743", + "step": 1720, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1720", + "timestamp": "2025-09-05 09:02:36.853432", + "step": 1720, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:02:37.032163", + "step": 1720, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2861115038394928, + "timestamp": "2025-09-05 09:02:37.034925", + "step": 1721, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:02:37.234316", + "step": 1721, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22263473272323608, + "timestamp": "2025-09-05 09:02:37.236395", + "step": 1722, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:37.435471", + "step": 1722, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2188272327184677, + "timestamp": "2025-09-05 09:02:37.437670", + "step": 1723, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:37.636131", + "step": 1723, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3807090222835541, + "timestamp": "2025-09-05 09:02:37.651194", + "step": 1724, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:37.842885", + "step": 1724, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3141788840293884, + "timestamp": "2025-09-05 09:02:37.844950", + "step": 1725, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:02:38.041711", + "step": 1725, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28930506110191345, + "timestamp": "2025-09-05 09:02:38.043434", + "step": 1726, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:02:38.240729", + "step": 1726, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3060302734375, + "timestamp": "2025-09-05 09:02:38.242833", + "step": 1727, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:02:38.442684", + "step": 1727, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3646160662174225, + "timestamp": "2025-09-05 09:02:38.459198", + "step": 1728, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:02:38.657264", + "step": 1728, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4415394365787506, + "timestamp": "2025-09-05 09:02:38.660657", + "step": 1729, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:38.867898", + "step": 1729, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.330495148897171, + "timestamp": "2025-09-05 09:02:38.870259", + "step": 1730, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:39.068825", + "step": 1730, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30471497774124146, + "timestamp": "2025-09-05 09:02:39.070707", + "step": 1731, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:02:39.267010", + "step": 1731, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3218672275543213, + "timestamp": "2025-09-05 09:02:39.281489", + "step": 1732, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:02:39.475538", + "step": 1732, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.467593252658844, + "timestamp": "2025-09-05 09:02:39.477300", + "step": 1733, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:02:39.726323", + "step": 1733, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.37956318259239197, + "timestamp": "2025-09-05 09:02:39.728240", + "step": 1734, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:39.925216", + "step": 1734, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4981067478656769, + "timestamp": "2025-09-05 09:02:39.927497", + "step": 1735, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:40.135632", + "step": 1735, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31048834323883057, + "timestamp": "2025-09-05 09:02:40.150598", + "step": 1736, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:02:40.366202", + "step": 1736, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.283828467130661, + "timestamp": "2025-09-05 09:02:40.368169", + "step": 1737, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:40.535557", + "step": 1737, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25476568937301636, + "timestamp": "2025-09-05 09:02:40.538151", + "step": 1738, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:02:40.752855", + "step": 1738, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23773322999477386, + "timestamp": "2025-09-05 09:02:40.754924", + "step": 1739, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:02:40.962448", + "step": 1739, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3678232729434967, + "timestamp": "2025-09-05 09:02:40.977673", + "step": 1740, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:02:46.695748", + "step": 1740, + "epoch": 2 + }, + { + "type": "pplx", + "content": 52.87610880309225, + "timestamp": "2025-09-05 09:02:46.697902", + "step": 1740, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:46.860812", + "step": 1740, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.42199501395225525, + "timestamp": "2025-09-05 09:02:46.862869", + "step": 1741, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:47.030713", + "step": 1741, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27012646198272705, + "timestamp": "2025-09-05 09:02:47.048230", + "step": 1742, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:47.253601", + "step": 1742, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33601266145706177, + "timestamp": "2025-09-05 09:02:47.255971", + "step": 1743, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:02:47.450875", + "step": 1743, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34322261810302734, + "timestamp": "2025-09-05 09:02:47.466057", + "step": 1744, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:02:47.707178", + "step": 1744, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.21854346990585327, + "timestamp": "2025-09-05 09:02:47.709562", + "step": 1745, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:02:47.915477", + "step": 1745, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2908678352832794, + "timestamp": "2025-09-05 09:02:47.917360", + "step": 1746, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:02:48.084021", + "step": 1746, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22152143716812134, + "timestamp": "2025-09-05 09:02:48.167207", + "step": 1747, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:48.412311", + "step": 1747, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3264712691307068, + "timestamp": "2025-09-05 09:02:48.427033", + "step": 1748, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:48.615841", + "step": 1748, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33076176047325134, + "timestamp": "2025-09-05 09:02:48.658513", + "step": 1749, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:02:48.907009", + "step": 1749, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29463064670562744, + "timestamp": "2025-09-05 09:02:48.909234", + "step": 1750, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:02:49.117191", + "step": 1750, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3289870023727417, + "timestamp": "2025-09-05 09:02:49.119123", + "step": 1751, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:02:49.314893", + "step": 1751, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.35243523120880127, + "timestamp": "2025-09-05 09:02:49.329655", + "step": 1752, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:49.654407", + "step": 1752, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.39635375142097473, + "timestamp": "2025-09-05 09:02:49.656285", + "step": 1753, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:49.861685", + "step": 1753, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23487235605716705, + "timestamp": "2025-09-05 09:02:49.864300", + "step": 1754, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:50.061153", + "step": 1754, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3939005136489868, + "timestamp": "2025-09-05 09:02:50.103946", + "step": 1755, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:50.438452", + "step": 1755, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3778200149536133, + "timestamp": "2025-09-05 09:02:50.448072", + "step": 1756, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:50.613169", + "step": 1756, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2921392619609833, + "timestamp": "2025-09-05 09:02:50.614850", + "step": 1757, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:02:50.819770", + "step": 1757, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24706591665744781, + "timestamp": "2025-09-05 09:02:50.821640", + "step": 1758, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:02:51.019320", + "step": 1758, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3144387900829315, + "timestamp": "2025-09-05 09:02:51.020986", + "step": 1759, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:02:51.218943", + "step": 1759, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.45034703612327576, + "timestamp": "2025-09-05 09:02:51.233986", + "step": 1760, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:02:56.821336", + "step": 1760, + "epoch": 2 + }, + { + "type": "pplx", + "content": 53.89968272891614, + "timestamp": "2025-09-05 09:02:56.825297", + "step": 1760, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1760", + "timestamp": "2025-09-05 09:02:57.358500", + "step": 1760, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:02:57.525650", + "step": 1760, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2909316122531891, + "timestamp": "2025-09-05 09:02:57.527607", + "step": 1761, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:02:57.732796", + "step": 1761, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.372611939907074, + "timestamp": "2025-09-05 09:02:57.734962", + "step": 1762, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:02:57.982758", + "step": 1762, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3195144534111023, + "timestamp": "2025-09-05 09:02:57.984997", + "step": 1763, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:58.191741", + "step": 1763, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2733082175254822, + "timestamp": "2025-09-05 09:02:58.206696", + "step": 1764, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:02:58.394732", + "step": 1764, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34646353125572205, + "timestamp": "2025-09-05 09:02:58.397168", + "step": 1765, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:02:58.604597", + "step": 1765, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2818068265914917, + "timestamp": "2025-09-05 09:02:58.666650", + "step": 1766, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:02:58.946660", + "step": 1766, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.20438231527805328, + "timestamp": "2025-09-05 09:02:58.948538", + "step": 1767, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:59.147170", + "step": 1767, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2941116690635681, + "timestamp": "2025-09-05 09:02:59.157344", + "step": 1768, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:02:59.318970", + "step": 1768, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.387058287858963, + "timestamp": "2025-09-05 09:02:59.341479", + "step": 1769, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:59.590735", + "step": 1769, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.5514186024665833, + "timestamp": "2025-09-05 09:02:59.593040", + "step": 1770, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:02:59.789424", + "step": 1770, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.39721089601516724, + "timestamp": "2025-09-05 09:02:59.791671", + "step": 1771, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:00.085156", + "step": 1771, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4141196608543396, + "timestamp": "2025-09-05 09:03:00.094213", + "step": 1772, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:00.255930", + "step": 1772, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3068144917488098, + "timestamp": "2025-09-05 09:03:00.257769", + "step": 1773, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:00.423547", + "step": 1773, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2862834632396698, + "timestamp": "2025-09-05 09:03:00.426909", + "step": 1774, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:03:00.636780", + "step": 1774, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.5047730207443237, + "timestamp": "2025-09-05 09:03:00.639450", + "step": 1775, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:00.890445", + "step": 1775, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3867059648036957, + "timestamp": "2025-09-05 09:03:00.907081", + "step": 1776, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:03:01.102574", + "step": 1776, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.32131898403167725, + "timestamp": "2025-09-05 09:03:01.104756", + "step": 1777, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:01.301984", + "step": 1777, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26770636439323425, + "timestamp": "2025-09-05 09:03:01.304878", + "step": 1778, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:03:01.554677", + "step": 1778, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.15419012308120728, + "timestamp": "2025-09-05 09:03:01.556780", + "step": 1779, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:01.752071", + "step": 1779, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4039864242076874, + "timestamp": "2025-09-05 09:03:01.766230", + "step": 1780, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:03:07.403589", + "step": 1780, + "epoch": 2 + }, + { + "type": "pplx", + "content": 54.678229296229425, + "timestamp": "2025-09-05 09:03:07.406101", + "step": 1780, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:03:07.567589", + "step": 1780, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33174604177474976, + "timestamp": "2025-09-05 09:03:07.569717", + "step": 1781, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:07.776051", + "step": 1781, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36827021837234497, + "timestamp": "2025-09-05 09:03:07.778234", + "step": 1782, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:07.984836", + "step": 1782, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23714663088321686, + "timestamp": "2025-09-05 09:03:07.986846", + "step": 1783, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:03:08.184589", + "step": 1783, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34843164682388306, + "timestamp": "2025-09-05 09:03:08.198645", + "step": 1784, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:03:08.387243", + "step": 1784, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33095985651016235, + "timestamp": "2025-09-05 09:03:08.388990", + "step": 1785, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:08.554852", + "step": 1785, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34028369188308716, + "timestamp": "2025-09-05 09:03:08.559697", + "step": 1786, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:08.766857", + "step": 1786, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25933557748794556, + "timestamp": "2025-09-05 09:03:08.770226", + "step": 1787, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:09.021745", + "step": 1787, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3989739716053009, + "timestamp": "2025-09-05 09:03:09.035974", + "step": 1788, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:09.224379", + "step": 1788, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38663944602012634, + "timestamp": "2025-09-05 09:03:09.227234", + "step": 1789, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:09.430888", + "step": 1789, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.17637419700622559, + "timestamp": "2025-09-05 09:03:09.432645", + "step": 1790, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:03:09.628726", + "step": 1790, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2469310164451599, + "timestamp": "2025-09-05 09:03:09.630565", + "step": 1791, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:03:09.826584", + "step": 1791, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3839358687400818, + "timestamp": "2025-09-05 09:03:09.840975", + "step": 1792, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:10.071018", + "step": 1792, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36574140191078186, + "timestamp": "2025-09-05 09:03:10.114734", + "step": 1793, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:10.319121", + "step": 1793, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3396137058734894, + "timestamp": "2025-09-05 09:03:10.321423", + "step": 1794, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:10.528211", + "step": 1794, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3768261671066284, + "timestamp": "2025-09-05 09:03:10.530196", + "step": 1795, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:10.775691", + "step": 1795, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3471960723400116, + "timestamp": "2025-09-05 09:03:10.791006", + "step": 1796, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:10.980600", + "step": 1796, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4404922425746918, + "timestamp": "2025-09-05 09:03:10.982368", + "step": 1797, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:03:11.274310", + "step": 1797, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2796347141265869, + "timestamp": "2025-09-05 09:03:11.276786", + "step": 1798, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:11.474461", + "step": 1798, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3296290338039398, + "timestamp": "2025-09-05 09:03:11.476700", + "step": 1799, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:11.682812", + "step": 1799, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2389107197523117, + "timestamp": "2025-09-05 09:03:11.699218", + "step": 1800, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:03:16.674241", + "step": 1800, + "epoch": 2 + }, + { + "type": "pplx", + "content": 55.036339384505084, + "timestamp": "2025-09-05 09:03:16.676009", + "step": 1800, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1800", + "timestamp": "2025-09-05 09:03:17.178926", + "step": 1800, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:03:17.379411", + "step": 1800, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.32822126150131226, + "timestamp": "2025-09-05 09:03:17.382120", + "step": 1801, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:17.578801", + "step": 1801, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2833464443683624, + "timestamp": "2025-09-05 09:03:17.581895", + "step": 1802, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:17.781822", + "step": 1802, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3650655448436737, + "timestamp": "2025-09-05 09:03:17.784468", + "step": 1803, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:03:18.050377", + "step": 1803, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3336227238178253, + "timestamp": "2025-09-05 09:03:18.066982", + "step": 1804, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:03:18.267529", + "step": 1804, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3089102804660797, + "timestamp": "2025-09-05 09:03:18.270031", + "step": 1805, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:18.468268", + "step": 1805, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2924884259700775, + "timestamp": "2025-09-05 09:03:18.470148", + "step": 1806, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:18.665728", + "step": 1806, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25991401076316833, + "timestamp": "2025-09-05 09:03:18.667725", + "step": 1807, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:18.872934", + "step": 1807, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38074877858161926, + "timestamp": "2025-09-05 09:03:18.887103", + "step": 1808, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:19.075301", + "step": 1808, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2662460505962372, + "timestamp": "2025-09-05 09:03:19.077492", + "step": 1809, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:03:19.283635", + "step": 1809, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3200852572917938, + "timestamp": "2025-09-05 09:03:19.285866", + "step": 1810, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:03:19.514373", + "step": 1810, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23158332705497742, + "timestamp": "2025-09-05 09:03:19.516172", + "step": 1811, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:19.714236", + "step": 1811, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38919728994369507, + "timestamp": "2025-09-05 09:03:19.723741", + "step": 1812, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:19.892125", + "step": 1812, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38500481843948364, + "timestamp": "2025-09-05 09:03:19.894564", + "step": 1813, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:20.062946", + "step": 1813, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4389552175998688, + "timestamp": "2025-09-05 09:03:20.065351", + "step": 1814, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:20.262580", + "step": 1814, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4390960931777954, + "timestamp": "2025-09-05 09:03:20.265034", + "step": 1815, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:03:20.432565", + "step": 1815, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3098478615283966, + "timestamp": "2025-09-05 09:03:20.448133", + "step": 1816, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:20.639359", + "step": 1816, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28565704822540283, + "timestamp": "2025-09-05 09:03:20.641294", + "step": 1817, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:03:20.850568", + "step": 1817, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.47005581855773926, + "timestamp": "2025-09-05 09:03:20.852541", + "step": 1818, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:21.020394", + "step": 1818, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38893210887908936, + "timestamp": "2025-09-05 09:03:21.022791", + "step": 1819, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:21.219082", + "step": 1819, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26181450486183167, + "timestamp": "2025-09-05 09:03:21.228380", + "step": 1820, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:03:25.920801", + "step": 1820, + "epoch": 2 + }, + { + "type": "pplx", + "content": 55.33664731691632, + "timestamp": "2025-09-05 09:03:25.923217", + "step": 1820, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:26.088428", + "step": 1820, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4290368854999542, + "timestamp": "2025-09-05 09:03:26.090971", + "step": 1821, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:26.256546", + "step": 1821, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3403278887271881, + "timestamp": "2025-09-05 09:03:26.272384", + "step": 1822, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:03:26.478670", + "step": 1822, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.35543927550315857, + "timestamp": "2025-09-05 09:03:26.481017", + "step": 1823, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:26.678647", + "step": 1823, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36923372745513916, + "timestamp": "2025-09-05 09:03:26.696460", + "step": 1824, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:26.894759", + "step": 1824, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33264076709747314, + "timestamp": "2025-09-05 09:03:26.898813", + "step": 1825, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:27.095284", + "step": 1825, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2916991710662842, + "timestamp": "2025-09-05 09:03:27.097313", + "step": 1826, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:27.294793", + "step": 1826, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.339844286441803, + "timestamp": "2025-09-05 09:03:27.296700", + "step": 1827, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:27.492683", + "step": 1827, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25583401322364807, + "timestamp": "2025-09-05 09:03:27.501965", + "step": 1828, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:03:27.667250", + "step": 1828, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2994399070739746, + "timestamp": "2025-09-05 09:03:27.669072", + "step": 1829, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:27.875984", + "step": 1829, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2248479723930359, + "timestamp": "2025-09-05 09:03:27.877941", + "step": 1830, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:28.074462", + "step": 1830, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.44062700867652893, + "timestamp": "2025-09-05 09:03:28.076844", + "step": 1831, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:03:28.284610", + "step": 1831, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34865128993988037, + "timestamp": "2025-09-05 09:03:28.294083", + "step": 1832, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:28.457941", + "step": 1832, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3390546441078186, + "timestamp": "2025-09-05 09:03:28.459806", + "step": 1833, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:28.625641", + "step": 1833, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4029175639152527, + "timestamp": "2025-09-05 09:03:28.627560", + "step": 1834, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:28.833445", + "step": 1834, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29427599906921387, + "timestamp": "2025-09-05 09:03:28.835361", + "step": 1835, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:29.042402", + "step": 1835, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36329180002212524, + "timestamp": "2025-09-05 09:03:29.059551", + "step": 1836, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:03:29.261899", + "step": 1836, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24370358884334564, + "timestamp": "2025-09-05 09:03:29.264242", + "step": 1837, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 240 + ], + "flops": 4800029206464.0 + }, + "timestamp": "2025-09-05 09:03:29.435765", + "step": 1837, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4641832411289215, + "timestamp": "2025-09-05 09:03:29.438505", + "step": 1838, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:03:29.645590", + "step": 1838, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27168017625808716, + "timestamp": "2025-09-05 09:03:29.648925", + "step": 1839, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:29.898200", + "step": 1839, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2619711458683014, + "timestamp": "2025-09-05 09:03:29.964385", + "step": 1840, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:03:34.650912", + "step": 1840, + "epoch": 2 + }, + { + "type": "pplx", + "content": 55.72341477187909, + "timestamp": "2025-09-05 09:03:34.653569", + "step": 1840, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1840", + "timestamp": "2025-09-05 09:03:35.179728", + "step": 1840, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:35.347722", + "step": 1840, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3577501177787781, + "timestamp": "2025-09-05 09:03:35.349827", + "step": 1841, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:35.556211", + "step": 1841, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3645785450935364, + "timestamp": "2025-09-05 09:03:35.558862", + "step": 1842, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:35.726141", + "step": 1842, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.356271892786026, + "timestamp": "2025-09-05 09:03:35.728901", + "step": 1843, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:35.932810", + "step": 1843, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2203180342912674, + "timestamp": "2025-09-05 09:03:35.942768", + "step": 1844, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:36.107321", + "step": 1844, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4093642234802246, + "timestamp": "2025-09-05 09:03:36.109424", + "step": 1845, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:36.277831", + "step": 1845, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2778435945510864, + "timestamp": "2025-09-05 09:03:36.280627", + "step": 1846, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:36.449396", + "step": 1846, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3252573013305664, + "timestamp": "2025-09-05 09:03:36.452403", + "step": 1847, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:03:36.659098", + "step": 1847, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.41045743227005005, + "timestamp": "2025-09-05 09:03:36.668901", + "step": 1848, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:36.833074", + "step": 1848, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38269633054733276, + "timestamp": "2025-09-05 09:03:36.836355", + "step": 1849, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:37.005419", + "step": 1849, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.46076807379722595, + "timestamp": "2025-09-05 09:03:37.007421", + "step": 1850, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:37.174049", + "step": 1850, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.40671306848526, + "timestamp": "2025-09-05 09:03:37.176204", + "step": 1851, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:37.373674", + "step": 1851, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.19898702204227448, + "timestamp": "2025-09-05 09:03:37.384578", + "step": 1852, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:37.547587", + "step": 1852, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.21051372587680817, + "timestamp": "2025-09-05 09:03:37.549476", + "step": 1853, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:03:37.758174", + "step": 1853, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.17444637417793274, + "timestamp": "2025-09-05 09:03:37.760015", + "step": 1854, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:38.011527", + "step": 1854, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25767529010772705, + "timestamp": "2025-09-05 09:03:38.013700", + "step": 1855, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:38.211086", + "step": 1855, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.283276230096817, + "timestamp": "2025-09-05 09:03:38.220740", + "step": 1856, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:03:38.384710", + "step": 1856, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36334753036499023, + "timestamp": "2025-09-05 09:03:38.387474", + "step": 1857, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:38.555920", + "step": 1857, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24148380756378174, + "timestamp": "2025-09-05 09:03:38.558779", + "step": 1858, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:03:38.758088", + "step": 1858, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2839478552341461, + "timestamp": "2025-09-05 09:03:38.760496", + "step": 1859, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:03:38.956342", + "step": 1859, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2954237163066864, + "timestamp": "2025-09-05 09:03:38.973295", + "step": 1860, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:03:43.752815", + "step": 1860, + "epoch": 2 + }, + { + "type": "pplx", + "content": 55.779841573659716, + "timestamp": "2025-09-05 09:03:43.755396", + "step": 1860, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:03:43.917975", + "step": 1860, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25287219882011414, + "timestamp": "2025-09-05 09:03:43.920185", + "step": 1861, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:03:44.125292", + "step": 1861, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36329612135887146, + "timestamp": "2025-09-05 09:03:44.127298", + "step": 1862, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:44.334342", + "step": 1862, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.214580699801445, + "timestamp": "2025-09-05 09:03:44.336143", + "step": 1863, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:44.543600", + "step": 1863, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.32577669620513916, + "timestamp": "2025-09-05 09:03:44.560339", + "step": 1864, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:03:44.759134", + "step": 1864, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.45734408497810364, + "timestamp": "2025-09-05 09:03:44.761720", + "step": 1865, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:44.967935", + "step": 1865, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3126198947429657, + "timestamp": "2025-09-05 09:03:44.970550", + "step": 1866, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:03:45.140680", + "step": 1866, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2847732603549957, + "timestamp": "2025-09-05 09:03:45.142554", + "step": 1867, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:45.349745", + "step": 1867, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25974026322364807, + "timestamp": "2025-09-05 09:03:45.359453", + "step": 1868, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:45.559114", + "step": 1868, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3027217388153076, + "timestamp": "2025-09-05 09:03:45.561007", + "step": 1869, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:45.767401", + "step": 1869, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2498902678489685, + "timestamp": "2025-09-05 09:03:45.769305", + "step": 1870, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:45.966695", + "step": 1870, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3127624988555908, + "timestamp": "2025-09-05 09:03:45.969508", + "step": 1871, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:46.181681", + "step": 1871, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.298271507024765, + "timestamp": "2025-09-05 09:03:46.199839", + "step": 1872, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:46.481768", + "step": 1872, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31834274530410767, + "timestamp": "2025-09-05 09:03:46.483757", + "step": 1873, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:03:46.783137", + "step": 1873, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.264957994222641, + "timestamp": "2025-09-05 09:03:46.799236", + "step": 1874, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:47.018791", + "step": 1874, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.43002596497535706, + "timestamp": "2025-09-05 09:03:47.020698", + "step": 1875, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:47.227210", + "step": 1875, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4498365819454193, + "timestamp": "2025-09-05 09:03:47.242075", + "step": 1876, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:03:47.496978", + "step": 1876, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.21851421892642975, + "timestamp": "2025-09-05 09:03:47.498804", + "step": 1877, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:47.665836", + "step": 1877, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3184349238872528, + "timestamp": "2025-09-05 09:03:47.667876", + "step": 1878, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:03:47.871856", + "step": 1878, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33598777651786804, + "timestamp": "2025-09-05 09:03:47.873933", + "step": 1879, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:48.041864", + "step": 1879, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31021371483802795, + "timestamp": "2025-09-05 09:03:48.058333", + "step": 1880, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:03:52.719413", + "step": 1880, + "epoch": 2 + }, + { + "type": "pplx", + "content": 55.9173870399914, + "timestamp": "2025-09-05 09:03:52.721544", + "step": 1880, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1880", + "timestamp": "2025-09-05 09:03:53.184102", + "step": 1880, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:53.347637", + "step": 1880, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.35257551074028015, + "timestamp": "2025-09-05 09:03:53.349442", + "step": 1881, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:03:53.516039", + "step": 1881, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.41870570182800293, + "timestamp": "2025-09-05 09:03:53.518151", + "step": 1882, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:03:53.685692", + "step": 1882, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33617204427719116, + "timestamp": "2025-09-05 09:03:53.688722", + "step": 1883, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:53.883855", + "step": 1883, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4209844172000885, + "timestamp": "2025-09-05 09:03:53.899544", + "step": 1884, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:54.089537", + "step": 1884, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27658504247665405, + "timestamp": "2025-09-05 09:03:54.097167", + "step": 1885, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:54.432684", + "step": 1885, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30306634306907654, + "timestamp": "2025-09-05 09:03:54.434610", + "step": 1886, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:54.601164", + "step": 1886, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3418419361114502, + "timestamp": "2025-09-05 09:03:54.603665", + "step": 1887, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:03:54.806396", + "step": 1887, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24061523377895355, + "timestamp": "2025-09-05 09:03:54.822115", + "step": 1888, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:55.010753", + "step": 1888, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3230704665184021, + "timestamp": "2025-09-05 09:03:55.013738", + "step": 1889, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:55.218951", + "step": 1889, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28835007548332214, + "timestamp": "2025-09-05 09:03:55.221346", + "step": 1890, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:55.387695", + "step": 1890, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.41431716084480286, + "timestamp": "2025-09-05 09:03:55.390377", + "step": 1891, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:03:55.586573", + "step": 1891, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.19797471165657043, + "timestamp": "2025-09-05 09:03:55.597127", + "step": 1892, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:55.759697", + "step": 1892, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3431963324546814, + "timestamp": "2025-09-05 09:03:55.762708", + "step": 1893, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:55.971229", + "step": 1893, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3090471923351288, + "timestamp": "2025-09-05 09:03:55.973593", + "step": 1894, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:03:56.142593", + "step": 1894, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2430524230003357, + "timestamp": "2025-09-05 09:03:56.145717", + "step": 1895, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:56.352947", + "step": 1895, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23343847692012787, + "timestamp": "2025-09-05 09:03:56.365935", + "step": 1896, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:56.529315", + "step": 1896, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3793867826461792, + "timestamp": "2025-09-05 09:03:56.531878", + "step": 1897, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:03:56.698816", + "step": 1897, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.37744349241256714, + "timestamp": "2025-09-05 09:03:56.701645", + "step": 1898, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:03:56.871278", + "step": 1898, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22595612704753876, + "timestamp": "2025-09-05 09:03:56.874840", + "step": 1899, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:03:57.081380", + "step": 1899, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25911441445350647, + "timestamp": "2025-09-05 09:03:57.096490", + "step": 1900, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:04:01.883995", + "step": 1900, + "epoch": 2 + }, + { + "type": "pplx", + "content": 55.85029067024223, + "timestamp": "2025-09-05 09:04:01.886021", + "step": 1900, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:04:02.049119", + "step": 1900, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31988221406936646, + "timestamp": "2025-09-05 09:04:02.050787", + "step": 1901, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:02.256430", + "step": 1901, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2469642162322998, + "timestamp": "2025-09-05 09:04:02.258336", + "step": 1902, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:02.428352", + "step": 1902, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.35464945435523987, + "timestamp": "2025-09-05 09:04:02.430549", + "step": 1903, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:02.629860", + "step": 1903, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4780825972557068, + "timestamp": "2025-09-05 09:04:02.640245", + "step": 1904, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:04:02.801940", + "step": 1904, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2757084369659424, + "timestamp": "2025-09-05 09:04:02.803938", + "step": 1905, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:02.972141", + "step": 1905, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.1613525003194809, + "timestamp": "2025-09-05 09:04:02.974528", + "step": 1906, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:04:03.181297", + "step": 1906, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36126402020454407, + "timestamp": "2025-09-05 09:04:03.183198", + "step": 1907, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:03.355002", + "step": 1907, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.49422934651374817, + "timestamp": "2025-09-05 09:04:03.370219", + "step": 1908, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:03.563094", + "step": 1908, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.40730562806129456, + "timestamp": "2025-09-05 09:04:03.564993", + "step": 1909, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:04:03.733709", + "step": 1909, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3607218265533447, + "timestamp": "2025-09-05 09:04:03.736360", + "step": 1910, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:04:03.931318", + "step": 1910, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3690507113933563, + "timestamp": "2025-09-05 09:04:03.933114", + "step": 1911, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:04:04.104801", + "step": 1911, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2338135987520218, + "timestamp": "2025-09-05 09:04:04.114485", + "step": 1912, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:04.278006", + "step": 1912, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.46766653656959534, + "timestamp": "2025-09-05 09:04:04.279765", + "step": 1913, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:04.449674", + "step": 1913, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36028924584388733, + "timestamp": "2025-09-05 09:04:04.451446", + "step": 1914, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:04.618848", + "step": 1914, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4140165448188782, + "timestamp": "2025-09-05 09:04:04.621294", + "step": 1915, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:04.819558", + "step": 1915, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2273964136838913, + "timestamp": "2025-09-05 09:04:04.828828", + "step": 1916, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:04.994269", + "step": 1916, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3443259596824646, + "timestamp": "2025-09-05 09:04:04.997681", + "step": 1917, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:05.165754", + "step": 1917, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30746108293533325, + "timestamp": "2025-09-05 09:04:05.167548", + "step": 1918, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:05.335215", + "step": 1918, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2145860493183136, + "timestamp": "2025-09-05 09:04:05.338280", + "step": 1919, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:05.544231", + "step": 1919, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3346792459487915, + "timestamp": "2025-09-05 09:04:05.558510", + "step": 1920, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:04:10.351935", + "step": 1920, + "epoch": 2 + }, + { + "type": "pplx", + "content": 56.15066865394947, + "timestamp": "2025-09-05 09:04:10.353809", + "step": 1920, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1920", + "timestamp": "2025-09-05 09:04:10.783963", + "step": 1920, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:10.952777", + "step": 1920, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34421074390411377, + "timestamp": "2025-09-05 09:04:10.954701", + "step": 1921, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:04:11.123289", + "step": 1921, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27249059081077576, + "timestamp": "2025-09-05 09:04:11.125101", + "step": 1922, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:11.292348", + "step": 1922, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2681417465209961, + "timestamp": "2025-09-05 09:04:11.295915", + "step": 1923, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:04:11.506679", + "step": 1923, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4062962234020233, + "timestamp": "2025-09-05 09:04:11.516326", + "step": 1924, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:11.678786", + "step": 1924, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3572784960269928, + "timestamp": "2025-09-05 09:04:11.680783", + "step": 1925, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:04:11.848575", + "step": 1925, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3298678994178772, + "timestamp": "2025-09-05 09:04:11.850534", + "step": 1926, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:04:12.045465", + "step": 1926, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.17969104647636414, + "timestamp": "2025-09-05 09:04:12.047081", + "step": 1927, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:12.213555", + "step": 1927, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27280840277671814, + "timestamp": "2025-09-05 09:04:12.227905", + "step": 1928, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:12.417495", + "step": 1928, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3779914379119873, + "timestamp": "2025-09-05 09:04:12.420129", + "step": 1929, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:12.587001", + "step": 1929, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.32336172461509705, + "timestamp": "2025-09-05 09:04:12.589142", + "step": 1930, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:12.796003", + "step": 1930, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.5328038930892944, + "timestamp": "2025-09-05 09:04:12.802014", + "step": 1931, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:13.012545", + "step": 1931, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2591715157032013, + "timestamp": "2025-09-05 09:04:13.022339", + "step": 1932, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:13.186085", + "step": 1932, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.39072778820991516, + "timestamp": "2025-09-05 09:04:13.188276", + "step": 1933, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:04:13.357480", + "step": 1933, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.20760498940944672, + "timestamp": "2025-09-05 09:04:13.359460", + "step": 1934, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:04:13.528291", + "step": 1934, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2971227467060089, + "timestamp": "2025-09-05 09:04:13.530536", + "step": 1935, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:04:13.728107", + "step": 1935, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3519618809223175, + "timestamp": "2025-09-05 09:04:13.743615", + "step": 1936, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:04:13.935962", + "step": 1936, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4732755720615387, + "timestamp": "2025-09-05 09:04:13.937754", + "step": 1937, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:14.144175", + "step": 1937, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22129914164543152, + "timestamp": "2025-09-05 09:04:14.146537", + "step": 1938, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:14.315053", + "step": 1938, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27725905179977417, + "timestamp": "2025-09-05 09:04:14.317471", + "step": 1939, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:14.483480", + "step": 1939, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30071911215782166, + "timestamp": "2025-09-05 09:04:14.498059", + "step": 1940, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:04:19.889789", + "step": 1940, + "epoch": 2 + }, + { + "type": "pplx", + "content": 56.40086004465284, + "timestamp": "2025-09-05 09:04:19.897148", + "step": 1940, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:04:20.060950", + "step": 1940, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.47121545672416687, + "timestamp": "2025-09-05 09:04:20.063252", + "step": 1941, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:20.230989", + "step": 1941, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2984376847743988, + "timestamp": "2025-09-05 09:04:20.233513", + "step": 1942, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:20.440935", + "step": 1942, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2947276532649994, + "timestamp": "2025-09-05 09:04:20.443535", + "step": 1943, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:20.611995", + "step": 1943, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3717845380306244, + "timestamp": "2025-09-05 09:04:20.628113", + "step": 1944, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:04:20.816134", + "step": 1944, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27086225152015686, + "timestamp": "2025-09-05 09:04:20.818437", + "step": 1945, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:20.986429", + "step": 1945, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36322546005249023, + "timestamp": "2025-09-05 09:04:20.989100", + "step": 1946, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:21.186891", + "step": 1946, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3279218077659607, + "timestamp": "2025-09-05 09:04:21.188653", + "step": 1947, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:04:21.356516", + "step": 1947, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3289004862308502, + "timestamp": "2025-09-05 09:04:21.371172", + "step": 1948, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:21.559646", + "step": 1948, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23576058447360992, + "timestamp": "2025-09-05 09:04:21.561647", + "step": 1949, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:21.767476", + "step": 1949, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4314030408859253, + "timestamp": "2025-09-05 09:04:21.769580", + "step": 1950, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:04:21.975210", + "step": 1950, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2590286135673523, + "timestamp": "2025-09-05 09:04:21.977268", + "step": 1951, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:04:22.143053", + "step": 1951, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3938618004322052, + "timestamp": "2025-09-05 09:04:22.158294", + "step": 1952, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:04:22.380932", + "step": 1952, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.41767221689224243, + "timestamp": "2025-09-05 09:04:22.383122", + "step": 1953, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:22.602652", + "step": 1953, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25870412588119507, + "timestamp": "2025-09-05 09:04:22.605208", + "step": 1954, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:22.771168", + "step": 1954, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.37756073474884033, + "timestamp": "2025-09-05 09:04:22.774225", + "step": 1955, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:22.982874", + "step": 1955, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3379029631614685, + "timestamp": "2025-09-05 09:04:22.993241", + "step": 1956, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:23.157972", + "step": 1956, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26806700229644775, + "timestamp": "2025-09-05 09:04:23.160348", + "step": 1957, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:23.326717", + "step": 1957, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2842908203601837, + "timestamp": "2025-09-05 09:04:23.329832", + "step": 1958, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:04:23.497903", + "step": 1958, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.45978856086730957, + "timestamp": "2025-09-05 09:04:23.500376", + "step": 1959, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:23.698055", + "step": 1959, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23071345686912537, + "timestamp": "2025-09-05 09:04:23.708192", + "step": 1960, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:04:28.419098", + "step": 1960, + "epoch": 2 + }, + { + "type": "pplx", + "content": 56.53060956108376, + "timestamp": "2025-09-05 09:04:28.421195", + "step": 1960, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1960", + "timestamp": "2025-09-05 09:04:28.888174", + "step": 1960, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:29.072136", + "step": 1960, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2766428589820862, + "timestamp": "2025-09-05 09:04:29.074140", + "step": 1961, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:29.275781", + "step": 1961, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4845949113368988, + "timestamp": "2025-09-05 09:04:29.277972", + "step": 1962, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:04:29.473812", + "step": 1962, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28940051794052124, + "timestamp": "2025-09-05 09:04:29.475571", + "step": 1963, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:29.642041", + "step": 1963, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3159952163696289, + "timestamp": "2025-09-05 09:04:29.659688", + "step": 1964, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:04:29.853746", + "step": 1964, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3902105391025543, + "timestamp": "2025-09-05 09:04:29.855230", + "step": 1965, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:30.058508", + "step": 1965, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2579273283481598, + "timestamp": "2025-09-05 09:04:30.060888", + "step": 1966, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:30.260907", + "step": 1966, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28875139355659485, + "timestamp": "2025-09-05 09:04:30.263070", + "step": 1967, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:30.530524", + "step": 1967, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3324761390686035, + "timestamp": "2025-09-05 09:04:30.540760", + "step": 1968, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:04:30.707202", + "step": 1968, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3139239251613617, + "timestamp": "2025-09-05 09:04:30.709065", + "step": 1969, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:30.877108", + "step": 1969, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2467249482870102, + "timestamp": "2025-09-05 09:04:30.879338", + "step": 1970, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:04:31.085452", + "step": 1970, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27524515986442566, + "timestamp": "2025-09-05 09:04:31.087634", + "step": 1971, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:31.294610", + "step": 1971, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2604565918445587, + "timestamp": "2025-09-05 09:04:31.304908", + "step": 1972, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:31.468728", + "step": 1972, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3006044030189514, + "timestamp": "2025-09-05 09:04:31.470507", + "step": 1973, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:04:31.656392", + "step": 1973, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3287713825702667, + "timestamp": "2025-09-05 09:04:31.658541", + "step": 1974, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:31.857546", + "step": 1974, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3500474989414215, + "timestamp": "2025-09-05 09:04:31.860148", + "step": 1975, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:04:32.066595", + "step": 1975, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24871431291103363, + "timestamp": "2025-09-05 09:04:32.081341", + "step": 1976, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:04:32.271920", + "step": 1976, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3919868469238281, + "timestamp": "2025-09-05 09:04:32.273912", + "step": 1977, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:32.440681", + "step": 1977, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2948307693004608, + "timestamp": "2025-09-05 09:04:32.442952", + "step": 1978, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:32.648995", + "step": 1978, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4514521062374115, + "timestamp": "2025-09-05 09:04:32.651170", + "step": 1979, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:04:32.856832", + "step": 1979, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3044537305831909, + "timestamp": "2025-09-05 09:04:32.871644", + "step": 1980, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:04:37.538105", + "step": 1980, + "epoch": 2 + }, + { + "type": "pplx", + "content": 56.71036435354335, + "timestamp": "2025-09-05 09:04:37.539835", + "step": 1980, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:04:37.703241", + "step": 1980, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29383978247642517, + "timestamp": "2025-09-05 09:04:37.705202", + "step": 1981, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:37.873478", + "step": 1981, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4132064878940582, + "timestamp": "2025-09-05 09:04:37.876394", + "step": 1982, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:04:38.043828", + "step": 1982, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.1803075671195984, + "timestamp": "2025-09-05 09:04:38.046427", + "step": 1983, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:38.241865", + "step": 1983, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.411940336227417, + "timestamp": "2025-09-05 09:04:38.257614", + "step": 1984, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 240 + ], + "flops": 4800029206464.0 + }, + "timestamp": "2025-09-05 09:04:38.446287", + "step": 1984, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3590596616268158, + "timestamp": "2025-09-05 09:04:38.448991", + "step": 1985, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:38.645695", + "step": 1985, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28954413533210754, + "timestamp": "2025-09-05 09:04:38.647436", + "step": 1986, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:04:38.812350", + "step": 1986, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25408488512039185, + "timestamp": "2025-09-05 09:04:38.814460", + "step": 1987, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:39.009224", + "step": 1987, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.18480801582336426, + "timestamp": "2025-09-05 09:04:39.018433", + "step": 1988, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:39.182413", + "step": 1988, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.305034339427948, + "timestamp": "2025-09-05 09:04:39.184267", + "step": 1989, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:39.390607", + "step": 1989, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23244498670101166, + "timestamp": "2025-09-05 09:04:39.392494", + "step": 1990, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:39.598816", + "step": 1990, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.40569478273391724, + "timestamp": "2025-09-05 09:04:39.600817", + "step": 1991, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:04:39.768888", + "step": 1991, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3343004882335663, + "timestamp": "2025-09-05 09:04:39.784051", + "step": 1992, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:39.972490", + "step": 1992, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2612588405609131, + "timestamp": "2025-09-05 09:04:39.974222", + "step": 1993, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 240 + ], + "flops": 4800029206464.0 + }, + "timestamp": "2025-09-05 09:04:40.142246", + "step": 1993, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3457273542881012, + "timestamp": "2025-09-05 09:04:40.144612", + "step": 1994, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:04:40.311740", + "step": 1994, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.32286980748176575, + "timestamp": "2025-09-05 09:04:40.314822", + "step": 1995, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:40.521126", + "step": 1995, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.32212406396865845, + "timestamp": "2025-09-05 09:04:40.530489", + "step": 1996, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:40.694811", + "step": 1996, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.35238468647003174, + "timestamp": "2025-09-05 09:04:40.697027", + "step": 1997, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:40.864697", + "step": 1997, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3432472050189972, + "timestamp": "2025-09-05 09:04:40.866865", + "step": 1998, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:41.064125", + "step": 1998, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30503416061401367, + "timestamp": "2025-09-05 09:04:41.066609", + "step": 1999, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:04:41.234458", + "step": 1999, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26616600155830383, + "timestamp": "2025-09-05 09:04:41.248958", + "step": 2000, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:04:46.035115", + "step": 2000, + "epoch": 2 + }, + { + "type": "pplx", + "content": 57.183245029940046, + "timestamp": "2025-09-05 09:04:46.039931", + "step": 2000, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2000", + "timestamp": "2025-09-05 09:04:46.512943", + "step": 2000, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:46.680863", + "step": 2000, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27458614110946655, + "timestamp": "2025-09-05 09:04:46.683153", + "step": 2001, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:46.878428", + "step": 2001, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3308562934398651, + "timestamp": "2025-09-05 09:04:46.880173", + "step": 2002, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:47.085991", + "step": 2002, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4302447736263275, + "timestamp": "2025-09-05 09:04:47.089966", + "step": 2003, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:47.289790", + "step": 2003, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23679831624031067, + "timestamp": "2025-09-05 09:04:47.299469", + "step": 2004, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:47.462911", + "step": 2004, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3112923204898834, + "timestamp": "2025-09-05 09:04:47.465129", + "step": 2005, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:47.632457", + "step": 2005, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30558183789253235, + "timestamp": "2025-09-05 09:04:47.634451", + "step": 2006, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:47.839869", + "step": 2006, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3617997467517853, + "timestamp": "2025-09-05 09:04:47.842181", + "step": 2007, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:48.008505", + "step": 2007, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23063942790031433, + "timestamp": "2025-09-05 09:04:48.025193", + "step": 2008, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:48.221927", + "step": 2008, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36768314242362976, + "timestamp": "2025-09-05 09:04:48.224032", + "step": 2009, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:48.391869", + "step": 2009, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3346289396286011, + "timestamp": "2025-09-05 09:04:48.393707", + "step": 2010, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:04:48.590531", + "step": 2010, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3734259605407715, + "timestamp": "2025-09-05 09:04:48.595428", + "step": 2011, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:04:48.810951", + "step": 2011, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3368525505065918, + "timestamp": "2025-09-05 09:04:48.825300", + "step": 2012, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:49.013722", + "step": 2012, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3550032377243042, + "timestamp": "2025-09-05 09:04:49.015412", + "step": 2013, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:49.220262", + "step": 2013, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.42761796712875366, + "timestamp": "2025-09-05 09:04:49.222077", + "step": 2014, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:49.428047", + "step": 2014, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23654963076114655, + "timestamp": "2025-09-05 09:04:49.430134", + "step": 2015, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:49.596867", + "step": 2015, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2988823354244232, + "timestamp": "2025-09-05 09:04:49.611145", + "step": 2016, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:49.799003", + "step": 2016, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.265523761510849, + "timestamp": "2025-09-05 09:04:49.800983", + "step": 2017, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:04:50.008530", + "step": 2017, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4000529944896698, + "timestamp": "2025-09-05 09:04:50.010746", + "step": 2018, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:50.178544", + "step": 2018, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24551428854465485, + "timestamp": "2025-09-05 09:04:50.180673", + "step": 2019, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:04:50.380479", + "step": 2019, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.15637749433517456, + "timestamp": "2025-09-05 09:04:50.390664", + "step": 2020, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:04:55.197240", + "step": 2020, + "epoch": 2 + }, + { + "type": "pplx", + "content": 57.22559411581902, + "timestamp": "2025-09-05 09:04:55.199506", + "step": 2020, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:55.360725", + "step": 2020, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29269856214523315, + "timestamp": "2025-09-05 09:04:55.362490", + "step": 2021, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:04:55.532762", + "step": 2021, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.40585339069366455, + "timestamp": "2025-09-05 09:04:55.535125", + "step": 2022, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:55.741574", + "step": 2022, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4142700135707855, + "timestamp": "2025-09-05 09:04:55.743607", + "step": 2023, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:55.911896", + "step": 2023, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3398078680038452, + "timestamp": "2025-09-05 09:04:55.927449", + "step": 2024, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:56.115575", + "step": 2024, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27966639399528503, + "timestamp": "2025-09-05 09:04:56.117340", + "step": 2025, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:56.283716", + "step": 2025, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3459966480731964, + "timestamp": "2025-09-05 09:04:56.285535", + "step": 2026, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:04:56.479497", + "step": 2026, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26111602783203125, + "timestamp": "2025-09-05 09:04:56.481512", + "step": 2027, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:04:56.647710", + "step": 2027, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2696475386619568, + "timestamp": "2025-09-05 09:04:56.663724", + "step": 2028, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:56.855157", + "step": 2028, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.44315868616104126, + "timestamp": "2025-09-05 09:04:56.858681", + "step": 2029, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:04:57.029209", + "step": 2029, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.37835097312927246, + "timestamp": "2025-09-05 09:04:57.031189", + "step": 2030, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:04:57.198452", + "step": 2030, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4174140989780426, + "timestamp": "2025-09-05 09:04:57.200571", + "step": 2031, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:57.397154", + "step": 2031, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3270459473133087, + "timestamp": "2025-09-05 09:04:57.407649", + "step": 2032, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:04:57.569889", + "step": 2032, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22918665409088135, + "timestamp": "2025-09-05 09:04:57.572029", + "step": 2033, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:04:57.778081", + "step": 2033, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.37719905376434326, + "timestamp": "2025-09-05 09:04:57.780047", + "step": 2034, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:04:57.985555", + "step": 2034, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.32334667444229126, + "timestamp": "2025-09-05 09:04:57.989649", + "step": 2035, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:04:58.201644", + "step": 2035, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.343068391084671, + "timestamp": "2025-09-05 09:04:58.259790", + "step": 2036, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:04:58.509098", + "step": 2036, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4084326922893524, + "timestamp": "2025-09-05 09:04:58.511936", + "step": 2037, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:04:58.717269", + "step": 2037, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24380765855312347, + "timestamp": "2025-09-05 09:04:58.720014", + "step": 2038, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:04:58.927965", + "step": 2038, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.21494068205356598, + "timestamp": "2025-09-05 09:04:58.930477", + "step": 2039, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:04:59.102037", + "step": 2039, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24519406259059906, + "timestamp": "2025-09-05 09:04:59.112573", + "step": 2040, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:05:03.749817", + "step": 2040, + "epoch": 2 + }, + { + "type": "pplx", + "content": 56.85761600576899, + "timestamp": "2025-09-05 09:05:03.751727", + "step": 2040, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2040", + "timestamp": "2025-09-05 09:05:04.212748", + "step": 2040, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:04.379772", + "step": 2040, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.21963070333003998, + "timestamp": "2025-09-05 09:05:04.381772", + "step": 2041, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:04.576449", + "step": 2041, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2468557208776474, + "timestamp": "2025-09-05 09:05:04.580133", + "step": 2042, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:04.775998", + "step": 2042, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38554346561431885, + "timestamp": "2025-09-05 09:05:04.778595", + "step": 2043, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:05:04.948336", + "step": 2043, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2700977325439453, + "timestamp": "2025-09-05 09:05:04.965970", + "step": 2044, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:05:05.163837", + "step": 2044, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.32830366492271423, + "timestamp": "2025-09-05 09:05:05.167027", + "step": 2045, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:05.337485", + "step": 2045, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.41252601146698, + "timestamp": "2025-09-05 09:05:05.339559", + "step": 2046, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:05.506711", + "step": 2046, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23501189053058624, + "timestamp": "2025-09-05 09:05:05.509934", + "step": 2047, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:05.676582", + "step": 2047, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.406737744808197, + "timestamp": "2025-09-05 09:05:05.686266", + "step": 2048, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:05.850915", + "step": 2048, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3673907220363617, + "timestamp": "2025-09-05 09:05:05.852489", + "step": 2049, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:06.047152", + "step": 2049, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3338184058666229, + "timestamp": "2025-09-05 09:05:06.050231", + "step": 2050, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:06.254566", + "step": 2050, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3576391935348511, + "timestamp": "2025-09-05 09:05:06.257351", + "step": 2051, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:06.456315", + "step": 2051, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4063916504383087, + "timestamp": "2025-09-05 09:05:06.465686", + "step": 2052, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:05:06.630051", + "step": 2052, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30287304520606995, + "timestamp": "2025-09-05 09:05:06.632774", + "step": 2053, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:06.801724", + "step": 2053, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31338661909103394, + "timestamp": "2025-09-05 09:05:06.804131", + "step": 2054, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:07.000461", + "step": 2054, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30023157596588135, + "timestamp": "2025-09-05 09:05:07.002435", + "step": 2055, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:07.169863", + "step": 2055, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.5755932927131653, + "timestamp": "2025-09-05 09:05:07.187296", + "step": 2056, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:05:07.385750", + "step": 2056, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.44111013412475586, + "timestamp": "2025-09-05 09:05:07.389213", + "step": 2057, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:07.596791", + "step": 2057, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3329983651638031, + "timestamp": "2025-09-05 09:05:07.598769", + "step": 2058, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:07.794782", + "step": 2058, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24375692009925842, + "timestamp": "2025-09-05 09:05:07.797646", + "step": 2059, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:07.965368", + "step": 2059, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27790704369544983, + "timestamp": "2025-09-05 09:05:07.982541", + "step": 2060, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:05:12.660572", + "step": 2060, + "epoch": 2 + }, + { + "type": "pplx", + "content": 56.09090719183764, + "timestamp": "2025-09-05 09:05:12.662701", + "step": 2060, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:12.824471", + "step": 2060, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2383831888437271, + "timestamp": "2025-09-05 09:05:12.826666", + "step": 2061, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:12.993388", + "step": 2061, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3052305579185486, + "timestamp": "2025-09-05 09:05:12.995388", + "step": 2062, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:13.161817", + "step": 2062, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.18823660910129547, + "timestamp": "2025-09-05 09:05:13.166329", + "step": 2063, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:13.365333", + "step": 2063, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28078800439834595, + "timestamp": "2025-09-05 09:05:13.422659", + "step": 2064, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:13.620690", + "step": 2064, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.42778801918029785, + "timestamp": "2025-09-05 09:05:13.622539", + "step": 2065, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:13.818849", + "step": 2065, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3101421594619751, + "timestamp": "2025-09-05 09:05:13.821061", + "step": 2066, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:14.017518", + "step": 2066, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27489471435546875, + "timestamp": "2025-09-05 09:05:14.020061", + "step": 2067, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:14.228453", + "step": 2067, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23832768201828003, + "timestamp": "2025-09-05 09:05:14.238406", + "step": 2068, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:14.403227", + "step": 2068, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3375934064388275, + "timestamp": "2025-09-05 09:05:14.405573", + "step": 2069, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:14.610505", + "step": 2069, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3630978763103485, + "timestamp": "2025-09-05 09:05:14.612484", + "step": 2070, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:05:14.779489", + "step": 2070, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3400574326515198, + "timestamp": "2025-09-05 09:05:14.781916", + "step": 2071, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:14.980142", + "step": 2071, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.40873268246650696, + "timestamp": "2025-09-05 09:05:14.994736", + "step": 2072, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:05:15.184588", + "step": 2072, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3188339173793793, + "timestamp": "2025-09-05 09:05:15.187393", + "step": 2073, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:15.356461", + "step": 2073, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2616739869117737, + "timestamp": "2025-09-05 09:05:15.358806", + "step": 2074, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:15.556729", + "step": 2074, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38481009006500244, + "timestamp": "2025-09-05 09:05:15.558582", + "step": 2075, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:15.727982", + "step": 2075, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3673509657382965, + "timestamp": "2025-09-05 09:05:15.737912", + "step": 2076, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:15.904494", + "step": 2076, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34988126158714294, + "timestamp": "2025-09-05 09:05:15.906553", + "step": 2077, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:16.111211", + "step": 2077, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.262952983379364, + "timestamp": "2025-09-05 09:05:16.113704", + "step": 2078, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:16.280634", + "step": 2078, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33671560883522034, + "timestamp": "2025-09-05 09:05:16.283544", + "step": 2079, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:16.478700", + "step": 2079, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.35512277483940125, + "timestamp": "2025-09-05 09:05:16.488316", + "step": 2080, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:05:21.153968", + "step": 2080, + "epoch": 2 + }, + { + "type": "pplx", + "content": 56.30905831955004, + "timestamp": "2025-09-05 09:05:21.155970", + "step": 2080, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2080", + "timestamp": "2025-09-05 09:05:21.636265", + "step": 2080, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:21.805059", + "step": 2080, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3978932201862335, + "timestamp": "2025-09-05 09:05:21.806980", + "step": 2081, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:21.973582", + "step": 2081, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23755325376987457, + "timestamp": "2025-09-05 09:05:21.975776", + "step": 2082, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:22.181802", + "step": 2082, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26282647252082825, + "timestamp": "2025-09-05 09:05:22.184030", + "step": 2083, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:22.390211", + "step": 2083, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22978627681732178, + "timestamp": "2025-09-05 09:05:22.399814", + "step": 2084, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:22.564860", + "step": 2084, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34872758388519287, + "timestamp": "2025-09-05 09:05:22.566428", + "step": 2085, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:22.733852", + "step": 2085, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26038891077041626, + "timestamp": "2025-09-05 09:05:22.735811", + "step": 2086, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:22.941243", + "step": 2086, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2772735357284546, + "timestamp": "2025-09-05 09:05:22.943426", + "step": 2087, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:23.171037", + "step": 2087, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3665330111980438, + "timestamp": "2025-09-05 09:05:23.185480", + "step": 2088, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:05:23.377188", + "step": 2088, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30773794651031494, + "timestamp": "2025-09-05 09:05:23.378924", + "step": 2089, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:23.547670", + "step": 2089, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3149157464504242, + "timestamp": "2025-09-05 09:05:23.549514", + "step": 2090, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:23.716414", + "step": 2090, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26793622970581055, + "timestamp": "2025-09-05 09:05:23.718590", + "step": 2091, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:23.915903", + "step": 2091, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23315511643886566, + "timestamp": "2025-09-05 09:05:23.930531", + "step": 2092, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:24.126973", + "step": 2092, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33972030878067017, + "timestamp": "2025-09-05 09:05:24.129837", + "step": 2093, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:24.328693", + "step": 2093, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24864479899406433, + "timestamp": "2025-09-05 09:05:24.332201", + "step": 2094, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:24.530634", + "step": 2094, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4089866876602173, + "timestamp": "2025-09-05 09:05:24.532314", + "step": 2095, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:24.700655", + "step": 2095, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36251306533813477, + "timestamp": "2025-09-05 09:05:24.710674", + "step": 2096, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:05:24.877319", + "step": 2096, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36653363704681396, + "timestamp": "2025-09-05 09:05:24.880792", + "step": 2097, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:25.090908", + "step": 2097, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2979757487773895, + "timestamp": "2025-09-05 09:05:25.093136", + "step": 2098, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:25.264142", + "step": 2098, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3006058633327484, + "timestamp": "2025-09-05 09:05:25.266754", + "step": 2099, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:25.476399", + "step": 2099, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3439769148826599, + "timestamp": "2025-09-05 09:05:25.493166", + "step": 2100, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:05:30.301851", + "step": 2100, + "epoch": 2 + }, + { + "type": "pplx", + "content": 55.87515440553129, + "timestamp": "2025-09-05 09:05:30.304097", + "step": 2100, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:05:30.467899", + "step": 2100, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3255309760570526, + "timestamp": "2025-09-05 09:05:30.469795", + "step": 2101, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:30.635767", + "step": 2101, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23818431794643402, + "timestamp": "2025-09-05 09:05:30.637417", + "step": 2102, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:30.804649", + "step": 2102, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2935701310634613, + "timestamp": "2025-09-05 09:05:30.806664", + "step": 2103, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:31.003143", + "step": 2103, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3238224983215332, + "timestamp": "2025-09-05 09:05:31.017915", + "step": 2104, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:31.207656", + "step": 2104, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26048019528388977, + "timestamp": "2025-09-05 09:05:31.209412", + "step": 2105, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:31.417385", + "step": 2105, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3144626319408417, + "timestamp": "2025-09-05 09:05:31.419292", + "step": 2106, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:31.615226", + "step": 2106, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31784558296203613, + "timestamp": "2025-09-05 09:05:31.617380", + "step": 2107, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:31.782250", + "step": 2107, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3896116018295288, + "timestamp": "2025-09-05 09:05:31.799351", + "step": 2108, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:05:31.999348", + "step": 2108, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.5450838208198547, + "timestamp": "2025-09-05 09:05:32.001492", + "step": 2109, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:32.207332", + "step": 2109, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4268830716609955, + "timestamp": "2025-09-05 09:05:32.209190", + "step": 2110, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:32.376126", + "step": 2110, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2802806496620178, + "timestamp": "2025-09-05 09:05:32.379085", + "step": 2111, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:32.582723", + "step": 2111, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3529273271560669, + "timestamp": "2025-09-05 09:05:32.597710", + "step": 2112, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:32.786023", + "step": 2112, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.37120527029037476, + "timestamp": "2025-09-05 09:05:32.788315", + "step": 2113, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:32.987392", + "step": 2113, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2828807532787323, + "timestamp": "2025-09-05 09:05:32.989258", + "step": 2114, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:33.197110", + "step": 2114, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3188009262084961, + "timestamp": "2025-09-05 09:05:33.198841", + "step": 2115, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:33.397741", + "step": 2115, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30147457122802734, + "timestamp": "2025-09-05 09:05:33.414853", + "step": 2116, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:33.606839", + "step": 2116, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.39994457364082336, + "timestamp": "2025-09-05 09:05:33.609368", + "step": 2117, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:33.814464", + "step": 2117, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29162168502807617, + "timestamp": "2025-09-05 09:05:33.816904", + "step": 2118, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:34.012704", + "step": 2118, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36326169967651367, + "timestamp": "2025-09-05 09:05:34.015173", + "step": 2119, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:34.220510", + "step": 2119, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.39839819073677063, + "timestamp": "2025-09-05 09:05:34.238310", + "step": 2120, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:05:38.962153", + "step": 2120, + "epoch": 2 + }, + { + "type": "pplx", + "content": 55.023435362113, + "timestamp": "2025-09-05 09:05:38.964199", + "step": 2120, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2120", + "timestamp": "2025-09-05 09:05:39.426004", + "step": 2120, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:39.590655", + "step": 2120, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.298641175031662, + "timestamp": "2025-09-05 09:05:39.592486", + "step": 2121, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:39.762317", + "step": 2121, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2771815061569214, + "timestamp": "2025-09-05 09:05:39.764288", + "step": 2122, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:39.970270", + "step": 2122, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3446735143661499, + "timestamp": "2025-09-05 09:05:39.973486", + "step": 2123, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:40.142076", + "step": 2123, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.32573774456977844, + "timestamp": "2025-09-05 09:05:40.158806", + "step": 2124, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:05:40.355566", + "step": 2124, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3329891264438629, + "timestamp": "2025-09-05 09:05:40.357398", + "step": 2125, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:40.524554", + "step": 2125, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3262787163257599, + "timestamp": "2025-09-05 09:05:40.526910", + "step": 2126, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:05:40.724145", + "step": 2126, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28894031047821045, + "timestamp": "2025-09-05 09:05:40.726046", + "step": 2127, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:40.894443", + "step": 2127, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.37630510330200195, + "timestamp": "2025-09-05 09:05:40.908812", + "step": 2128, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:41.096804", + "step": 2128, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31172171235084534, + "timestamp": "2025-09-05 09:05:41.098750", + "step": 2129, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:41.294440", + "step": 2129, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3221554756164551, + "timestamp": "2025-09-05 09:05:41.296586", + "step": 2130, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:41.464390", + "step": 2130, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22313398122787476, + "timestamp": "2025-09-05 09:05:41.466652", + "step": 2131, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:41.671802", + "step": 2131, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27833616733551025, + "timestamp": "2025-09-05 09:05:41.688374", + "step": 2132, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:41.888539", + "step": 2132, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.35830333828926086, + "timestamp": "2025-09-05 09:05:41.890675", + "step": 2133, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:42.057035", + "step": 2133, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3745828866958618, + "timestamp": "2025-09-05 09:05:42.059027", + "step": 2134, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:42.256318", + "step": 2134, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4300999939441681, + "timestamp": "2025-09-05 09:05:42.258212", + "step": 2135, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:42.425915", + "step": 2135, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4072500169277191, + "timestamp": "2025-09-05 09:05:42.440937", + "step": 2136, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:42.629269", + "step": 2136, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2526228129863739, + "timestamp": "2025-09-05 09:05:42.631217", + "step": 2137, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:42.837475", + "step": 2137, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.39318081736564636, + "timestamp": "2025-09-05 09:05:42.839334", + "step": 2138, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:43.005908", + "step": 2138, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2297356128692627, + "timestamp": "2025-09-05 09:05:43.008470", + "step": 2139, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:05:43.206987", + "step": 2139, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33987030386924744, + "timestamp": "2025-09-05 09:05:43.216353", + "step": 2140, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:05:47.865119", + "step": 2140, + "epoch": 2 + }, + { + "type": "pplx", + "content": 55.06809153484284, + "timestamp": "2025-09-05 09:05:47.867422", + "step": 2140, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:48.032484", + "step": 2140, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4749826192855835, + "timestamp": "2025-09-05 09:05:48.034336", + "step": 2141, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:48.200139", + "step": 2141, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28325966000556946, + "timestamp": "2025-09-05 09:05:48.201779", + "step": 2142, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:48.369121", + "step": 2142, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.42103084921836853, + "timestamp": "2025-09-05 09:05:48.370850", + "step": 2143, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:48.566526", + "step": 2143, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38202551007270813, + "timestamp": "2025-09-05 09:05:48.576121", + "step": 2144, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:48.738085", + "step": 2144, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3595607578754425, + "timestamp": "2025-09-05 09:05:48.740049", + "step": 2145, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:05:48.947701", + "step": 2145, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3413144648075104, + "timestamp": "2025-09-05 09:05:48.949394", + "step": 2146, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:49.156351", + "step": 2146, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24180643260478973, + "timestamp": "2025-09-05 09:05:49.157912", + "step": 2147, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:49.325916", + "step": 2147, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2788175344467163, + "timestamp": "2025-09-05 09:05:49.342534", + "step": 2148, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:49.540402", + "step": 2148, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.39272475242614746, + "timestamp": "2025-09-05 09:05:49.542346", + "step": 2149, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:49.709083", + "step": 2149, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2623364329338074, + "timestamp": "2025-09-05 09:05:49.711039", + "step": 2150, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:49.919434", + "step": 2150, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.21915015578269958, + "timestamp": "2025-09-05 09:05:49.921268", + "step": 2151, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:50.088631", + "step": 2151, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38461872935295105, + "timestamp": "2025-09-05 09:05:50.105713", + "step": 2152, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:50.302548", + "step": 2152, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4068615138530731, + "timestamp": "2025-09-05 09:05:50.304488", + "step": 2153, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:50.472419", + "step": 2153, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3176921606063843, + "timestamp": "2025-09-05 09:05:50.474129", + "step": 2154, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:50.681724", + "step": 2154, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34700101613998413, + "timestamp": "2025-09-05 09:05:50.683454", + "step": 2155, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:50.851655", + "step": 2155, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3761594891548157, + "timestamp": "2025-09-05 09:05:50.869173", + "step": 2156, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:05:51.068367", + "step": 2156, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.1952638030052185, + "timestamp": "2025-09-05 09:05:51.070604", + "step": 2157, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:05:51.320283", + "step": 2157, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4499374330043793, + "timestamp": "2025-09-05 09:05:51.322342", + "step": 2158, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:51.521209", + "step": 2158, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2834685146808624, + "timestamp": "2025-09-05 09:05:51.523072", + "step": 2159, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:51.731601", + "step": 2159, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27654868364334106, + "timestamp": "2025-09-05 09:05:51.745727", + "step": 2160, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:05:57.239086", + "step": 2160, + "epoch": 2 + }, + { + "type": "pplx", + "content": 55.48316011694305, + "timestamp": "2025-09-05 09:05:57.242217", + "step": 2160, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2160", + "timestamp": "2025-09-05 09:05:57.693603", + "step": 2160, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:57.858500", + "step": 2160, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.35953032970428467, + "timestamp": "2025-09-05 09:05:57.860617", + "step": 2161, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:58.029205", + "step": 2161, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4004409611225128, + "timestamp": "2025-09-05 09:05:58.030979", + "step": 2162, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:58.198364", + "step": 2162, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.45261430740356445, + "timestamp": "2025-09-05 09:05:58.200549", + "step": 2163, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:58.367098", + "step": 2163, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.1983097940683365, + "timestamp": "2025-09-05 09:05:58.382083", + "step": 2164, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:58.571827", + "step": 2164, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2994425594806671, + "timestamp": "2025-09-05 09:05:58.573488", + "step": 2165, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:05:58.743371", + "step": 2165, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3872717618942261, + "timestamp": "2025-09-05 09:05:58.745170", + "step": 2166, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:58.953533", + "step": 2166, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2745550572872162, + "timestamp": "2025-09-05 09:05:58.955400", + "step": 2167, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:05:59.164803", + "step": 2167, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3339973986148834, + "timestamp": "2025-09-05 09:05:59.180082", + "step": 2168, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:05:59.375968", + "step": 2168, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4272925555706024, + "timestamp": "2025-09-05 09:05:59.378179", + "step": 2169, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:59.586272", + "step": 2169, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2760466933250427, + "timestamp": "2025-09-05 09:05:59.588264", + "step": 2170, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:59.795018", + "step": 2170, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25868964195251465, + "timestamp": "2025-09-05 09:05:59.796794", + "step": 2171, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:05:59.992597", + "step": 2171, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2131020873785019, + "timestamp": "2025-09-05 09:06:00.009053", + "step": 2172, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:00.205797", + "step": 2172, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3480170667171478, + "timestamp": "2025-09-05 09:06:00.207987", + "step": 2173, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:00.375416", + "step": 2173, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38949131965637207, + "timestamp": "2025-09-05 09:06:00.377072", + "step": 2174, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:06:00.582310", + "step": 2174, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3337663412094116, + "timestamp": "2025-09-05 09:06:00.584027", + "step": 2175, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:00.780812", + "step": 2175, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.48506397008895874, + "timestamp": "2025-09-05 09:06:00.796080", + "step": 2176, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:00.988006", + "step": 2176, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34281837940216064, + "timestamp": "2025-09-05 09:06:00.989875", + "step": 2177, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:06:01.188090", + "step": 2177, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2596714496612549, + "timestamp": "2025-09-05 09:06:01.189855", + "step": 2178, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:01.360399", + "step": 2178, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.20409443974494934, + "timestamp": "2025-09-05 09:06:01.362078", + "step": 2179, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:01.567175", + "step": 2179, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34518057107925415, + "timestamp": "2025-09-05 09:06:01.581103", + "step": 2180, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:06:06.460758", + "step": 2180, + "epoch": 2 + }, + { + "type": "pplx", + "content": 55.87232859467735, + "timestamp": "2025-09-05 09:06:06.462417", + "step": 2180, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:06:06.626251", + "step": 2180, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3369835317134857, + "timestamp": "2025-09-05 09:06:06.628361", + "step": 2181, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:06:06.797342", + "step": 2181, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3712189793586731, + "timestamp": "2025-09-05 09:06:06.798930", + "step": 2182, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:06.969597", + "step": 2182, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3187856376171112, + "timestamp": "2025-09-05 09:06:06.972251", + "step": 2183, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:07.140449", + "step": 2183, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.20823392271995544, + "timestamp": "2025-09-05 09:06:07.158686", + "step": 2184, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:06:07.365847", + "step": 2184, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3842635154724121, + "timestamp": "2025-09-05 09:06:07.372022", + "step": 2185, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:07.583304", + "step": 2185, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24682661890983582, + "timestamp": "2025-09-05 09:06:07.586882", + "step": 2186, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:07.785086", + "step": 2186, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.32766950130462646, + "timestamp": "2025-09-05 09:06:07.788027", + "step": 2187, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:06:07.989795", + "step": 2187, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.49860307574272156, + "timestamp": "2025-09-05 09:06:08.007831", + "step": 2188, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:06:08.211143", + "step": 2188, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4436277449131012, + "timestamp": "2025-09-05 09:06:08.213455", + "step": 2189, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:08.420670", + "step": 2189, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24359887838363647, + "timestamp": "2025-09-05 09:06:08.423027", + "step": 2190, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:08.629797", + "step": 2190, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2521995007991791, + "timestamp": "2025-09-05 09:06:08.632822", + "step": 2191, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:08.799043", + "step": 2191, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4108291566371918, + "timestamp": "2025-09-05 09:06:08.814395", + "step": 2192, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:09.001706", + "step": 2192, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4450414478778839, + "timestamp": "2025-09-05 09:06:09.004367", + "step": 2193, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:06:09.174249", + "step": 2193, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3344271779060364, + "timestamp": "2025-09-05 09:06:09.176943", + "step": 2194, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:06:09.385077", + "step": 2194, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3703814744949341, + "timestamp": "2025-09-05 09:06:09.388491", + "step": 2195, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:09.556016", + "step": 2195, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.44833752512931824, + "timestamp": "2025-09-05 09:06:09.566350", + "step": 2196, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:06:09.730307", + "step": 2196, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3722361922264099, + "timestamp": "2025-09-05 09:06:09.732642", + "step": 2197, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:09.938807", + "step": 2197, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.43370842933654785, + "timestamp": "2025-09-05 09:06:09.941992", + "step": 2198, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:10.138574", + "step": 2198, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27109774947166443, + "timestamp": "2025-09-05 09:06:10.142016", + "step": 2199, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:06:10.347324", + "step": 2199, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23769542574882507, + "timestamp": "2025-09-05 09:06:10.365235", + "step": 2200, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:06:15.248799", + "step": 2200, + "epoch": 2 + }, + { + "type": "pplx", + "content": 55.81018461535365, + "timestamp": "2025-09-05 09:06:15.251062", + "step": 2200, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2200", + "timestamp": "2025-09-05 09:06:15.710155", + "step": 2200, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:06:15.875503", + "step": 2200, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3465867042541504, + "timestamp": "2025-09-05 09:06:15.877472", + "step": 2201, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:06:16.045988", + "step": 2201, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28614541888237, + "timestamp": "2025-09-05 09:06:16.047770", + "step": 2202, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:16.215359", + "step": 2202, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2435327023267746, + "timestamp": "2025-09-05 09:06:16.217829", + "step": 2203, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:06:16.412558", + "step": 2203, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28242167830467224, + "timestamp": "2025-09-05 09:06:16.429917", + "step": 2204, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:16.627712", + "step": 2204, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.40695592761039734, + "timestamp": "2025-09-05 09:06:16.629514", + "step": 2205, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:06:16.796399", + "step": 2205, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2968638241291046, + "timestamp": "2025-09-05 09:06:16.798607", + "step": 2206, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:16.995784", + "step": 2206, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29859426617622375, + "timestamp": "2025-09-05 09:06:16.997868", + "step": 2207, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:17.204998", + "step": 2207, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2591385841369629, + "timestamp": "2025-09-05 09:06:17.219261", + "step": 2208, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:06:17.409143", + "step": 2208, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3400825262069702, + "timestamp": "2025-09-05 09:06:17.440817", + "step": 2209, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:17.693884", + "step": 2209, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3577132821083069, + "timestamp": "2025-09-05 09:06:17.695615", + "step": 2210, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:17.895401", + "step": 2210, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.346355140209198, + "timestamp": "2025-09-05 09:06:17.897200", + "step": 2211, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:06:18.065043", + "step": 2211, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.302179217338562, + "timestamp": "2025-09-05 09:06:18.081108", + "step": 2212, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:18.271084", + "step": 2212, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27992647886276245, + "timestamp": "2025-09-05 09:06:18.273478", + "step": 2213, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:18.440962", + "step": 2213, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2065676599740982, + "timestamp": "2025-09-05 09:06:18.443396", + "step": 2214, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:18.655269", + "step": 2214, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28311607241630554, + "timestamp": "2025-09-05 09:06:18.657127", + "step": 2215, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:18.824537", + "step": 2215, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3312028646469116, + "timestamp": "2025-09-05 09:06:18.838942", + "step": 2216, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:19.028366", + "step": 2216, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2649993896484375, + "timestamp": "2025-09-05 09:06:19.030572", + "step": 2217, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:06:19.239728", + "step": 2217, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2982449233531952, + "timestamp": "2025-09-05 09:06:19.241520", + "step": 2218, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:19.450240", + "step": 2218, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27559399604797363, + "timestamp": "2025-09-05 09:06:19.454792", + "step": 2219, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:19.653922", + "step": 2219, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28188076615333557, + "timestamp": "2025-09-05 09:06:19.671388", + "step": 2220, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:06:24.361464", + "step": 2220, + "epoch": 2 + }, + { + "type": "pplx", + "content": 55.25948775718098, + "timestamp": "2025-09-05 09:06:24.364366", + "step": 2220, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:06:24.528802", + "step": 2220, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28795936703681946, + "timestamp": "2025-09-05 09:06:24.530496", + "step": 2221, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:24.697761", + "step": 2221, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.40923991799354553, + "timestamp": "2025-09-05 09:06:24.699852", + "step": 2222, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:24.867605", + "step": 2222, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31736886501312256, + "timestamp": "2025-09-05 09:06:24.869794", + "step": 2223, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:25.036879", + "step": 2223, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2096620500087738, + "timestamp": "2025-09-05 09:06:25.051680", + "step": 2224, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:25.239543", + "step": 2224, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.484735906124115, + "timestamp": "2025-09-05 09:06:25.241461", + "step": 2225, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:25.438820", + "step": 2225, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34063124656677246, + "timestamp": "2025-09-05 09:06:25.441718", + "step": 2226, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:25.637874", + "step": 2226, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29765599966049194, + "timestamp": "2025-09-05 09:06:25.639588", + "step": 2227, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:25.805672", + "step": 2227, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24793501198291779, + "timestamp": "2025-09-05 09:06:25.814750", + "step": 2228, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:25.976719", + "step": 2228, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33141300082206726, + "timestamp": "2025-09-05 09:06:25.979149", + "step": 2229, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:26.186014", + "step": 2229, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3666149079799652, + "timestamp": "2025-09-05 09:06:26.188475", + "step": 2230, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:26.475436", + "step": 2230, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23982176184654236, + "timestamp": "2025-09-05 09:06:26.477600", + "step": 2231, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:26.673606", + "step": 2231, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23586024343967438, + "timestamp": "2025-09-05 09:06:26.684063", + "step": 2232, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:26.847693", + "step": 2232, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22031444311141968, + "timestamp": "2025-09-05 09:06:26.849808", + "step": 2233, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:27.016397", + "step": 2233, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2034074366092682, + "timestamp": "2025-09-05 09:06:27.018349", + "step": 2234, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:06:27.224905", + "step": 2234, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3312498927116394, + "timestamp": "2025-09-05 09:06:27.226712", + "step": 2235, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:27.393791", + "step": 2235, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.16999323666095734, + "timestamp": "2025-09-05 09:06:27.410448", + "step": 2236, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:27.606051", + "step": 2236, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.262175053358078, + "timestamp": "2025-09-05 09:06:27.607922", + "step": 2237, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:27.804022", + "step": 2237, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3137682378292084, + "timestamp": "2025-09-05 09:06:27.805913", + "step": 2238, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:28.012450", + "step": 2238, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.40954217314720154, + "timestamp": "2025-09-05 09:06:28.014922", + "step": 2239, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:28.182501", + "step": 2239, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38842806220054626, + "timestamp": "2025-09-05 09:06:28.199729", + "step": 2240, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:06:33.159795", + "step": 2240, + "epoch": 2 + }, + { + "type": "pplx", + "content": 54.81411080714311, + "timestamp": "2025-09-05 09:06:33.161733", + "step": 2240, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2240", + "timestamp": "2025-09-05 09:06:33.625419", + "step": 2240, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:06:33.789231", + "step": 2240, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30577048659324646, + "timestamp": "2025-09-05 09:06:33.791819", + "step": 2241, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:33.959669", + "step": 2241, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2516123950481415, + "timestamp": "2025-09-05 09:06:33.961351", + "step": 2242, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:34.128513", + "step": 2242, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29832541942596436, + "timestamp": "2025-09-05 09:06:34.130926", + "step": 2243, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:34.326975", + "step": 2243, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.352728009223938, + "timestamp": "2025-09-05 09:06:34.336301", + "step": 2244, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:34.500913", + "step": 2244, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29814958572387695, + "timestamp": "2025-09-05 09:06:34.503201", + "step": 2245, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:06:34.671218", + "step": 2245, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3409227728843689, + "timestamp": "2025-09-05 09:06:34.673286", + "step": 2246, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:06:34.867144", + "step": 2246, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.37319034337997437, + "timestamp": "2025-09-05 09:06:34.869145", + "step": 2247, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:35.035883", + "step": 2247, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.378897100687027, + "timestamp": "2025-09-05 09:06:35.050269", + "step": 2248, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:06:35.241028", + "step": 2248, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.47297292947769165, + "timestamp": "2025-09-05 09:06:35.243250", + "step": 2249, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:35.448153", + "step": 2249, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2587435245513916, + "timestamp": "2025-09-05 09:06:35.450194", + "step": 2250, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:35.620591", + "step": 2250, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.21650661528110504, + "timestamp": "2025-09-05 09:06:35.622564", + "step": 2251, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:35.819398", + "step": 2251, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3861205279827118, + "timestamp": "2025-09-05 09:06:35.834116", + "step": 2252, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:36.023040", + "step": 2252, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24851293861865997, + "timestamp": "2025-09-05 09:06:36.025229", + "step": 2253, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:36.193242", + "step": 2253, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38230419158935547, + "timestamp": "2025-09-05 09:06:36.195357", + "step": 2254, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:06:36.401772", + "step": 2254, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23924830555915833, + "timestamp": "2025-09-05 09:06:36.403897", + "step": 2255, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:36.601332", + "step": 2255, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23087681829929352, + "timestamp": "2025-09-05 09:06:36.616592", + "step": 2256, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:36.803359", + "step": 2256, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31772562861442566, + "timestamp": "2025-09-05 09:06:36.805156", + "step": 2257, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:06:37.000355", + "step": 2257, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.37587422132492065, + "timestamp": "2025-09-05 09:06:37.002348", + "step": 2258, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:37.169293", + "step": 2258, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.5363959074020386, + "timestamp": "2025-09-05 09:06:37.171488", + "step": 2259, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:06:37.376633", + "step": 2259, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3956094980239868, + "timestamp": "2025-09-05 09:06:37.393078", + "step": 2260, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:06:42.110986", + "step": 2260, + "epoch": 2 + }, + { + "type": "pplx", + "content": 54.873000385281806, + "timestamp": "2025-09-05 09:06:42.113444", + "step": 2260, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:42.274354", + "step": 2260, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3519768714904785, + "timestamp": "2025-09-05 09:06:42.276425", + "step": 2261, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:42.526981", + "step": 2261, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2870665192604065, + "timestamp": "2025-09-05 09:06:42.528820", + "step": 2262, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:42.732781", + "step": 2262, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3545304238796234, + "timestamp": "2025-09-05 09:06:42.734675", + "step": 2263, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:42.932972", + "step": 2263, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2984987497329712, + "timestamp": "2025-09-05 09:06:42.943221", + "step": 2264, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:43.105999", + "step": 2264, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3972173035144806, + "timestamp": "2025-09-05 09:06:43.108100", + "step": 2265, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:43.275800", + "step": 2265, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3706391751766205, + "timestamp": "2025-09-05 09:06:43.277701", + "step": 2266, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:43.471167", + "step": 2266, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36558616161346436, + "timestamp": "2025-09-05 09:06:43.473657", + "step": 2267, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:06:43.641189", + "step": 2267, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2613338232040405, + "timestamp": "2025-09-05 09:06:43.658410", + "step": 2268, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:06:43.856335", + "step": 2268, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3643812835216522, + "timestamp": "2025-09-05 09:06:43.858525", + "step": 2269, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:44.057101", + "step": 2269, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3404228091239929, + "timestamp": "2025-09-05 09:06:44.059038", + "step": 2270, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:44.228781", + "step": 2270, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4255596101284027, + "timestamp": "2025-09-05 09:06:44.230693", + "step": 2271, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:44.399436", + "step": 2271, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3181018829345703, + "timestamp": "2025-09-05 09:06:44.408822", + "step": 2272, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:44.571743", + "step": 2272, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2214895486831665, + "timestamp": "2025-09-05 09:06:44.573526", + "step": 2273, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:44.740584", + "step": 2273, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34708890318870544, + "timestamp": "2025-09-05 09:06:44.742388", + "step": 2274, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:44.938691", + "step": 2274, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.39245232939720154, + "timestamp": "2025-09-05 09:06:44.940467", + "step": 2275, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:06:45.109042", + "step": 2275, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30886292457580566, + "timestamp": "2025-09-05 09:06:45.126949", + "step": 2276, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:06:45.324958", + "step": 2276, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3094596564769745, + "timestamp": "2025-09-05 09:06:45.327565", + "step": 2277, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:45.532781", + "step": 2277, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3498867452144623, + "timestamp": "2025-09-05 09:06:45.534636", + "step": 2278, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:45.741366", + "step": 2278, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4171161651611328, + "timestamp": "2025-09-05 09:06:45.743093", + "step": 2279, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:06:45.912226", + "step": 2279, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3185728192329407, + "timestamp": "2025-09-05 09:06:45.927001", + "step": 2280, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:06:50.639787", + "step": 2280, + "epoch": 2 + }, + { + "type": "pplx", + "content": 54.187925265235314, + "timestamp": "2025-09-05 09:06:50.641591", + "step": 2280, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2280", + "timestamp": "2025-09-05 09:06:51.097949", + "step": 2280, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:06:51.263440", + "step": 2280, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31183862686157227, + "timestamp": "2025-09-05 09:06:51.265652", + "step": 2281, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:51.433139", + "step": 2281, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.39186206459999084, + "timestamp": "2025-09-05 09:06:51.435416", + "step": 2282, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:51.642913", + "step": 2282, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30460259318351746, + "timestamp": "2025-09-05 09:06:51.644628", + "step": 2283, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:51.811414", + "step": 2283, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3289051055908203, + "timestamp": "2025-09-05 09:06:51.827980", + "step": 2284, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:06:52.021861", + "step": 2284, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3493216335773468, + "timestamp": "2025-09-05 09:06:52.023824", + "step": 2285, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:06:52.188519", + "step": 2285, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.1778910756111145, + "timestamp": "2025-09-05 09:06:52.190599", + "step": 2286, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:52.388190", + "step": 2286, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22400565445423126, + "timestamp": "2025-09-05 09:06:52.390250", + "step": 2287, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:06:52.595404", + "step": 2287, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2981906533241272, + "timestamp": "2025-09-05 09:06:52.604955", + "step": 2288, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:06:52.767428", + "step": 2288, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26950085163116455, + "timestamp": "2025-09-05 09:06:52.769577", + "step": 2289, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:06:52.935744", + "step": 2289, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29919737577438354, + "timestamp": "2025-09-05 09:06:52.937652", + "step": 2290, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:06:53.134942", + "step": 2290, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.35156911611557007, + "timestamp": "2025-09-05 09:06:53.136642", + "step": 2291, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:53.303716", + "step": 2291, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.376793771982193, + "timestamp": "2025-09-05 09:06:53.319226", + "step": 2292, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:53.514148", + "step": 2292, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38039150834083557, + "timestamp": "2025-09-05 09:06:53.515929", + "step": 2293, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:53.682172", + "step": 2293, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29825207591056824, + "timestamp": "2025-09-05 09:06:53.684304", + "step": 2294, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:53.880490", + "step": 2294, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31032633781433105, + "timestamp": "2025-09-05 09:06:53.882195", + "step": 2295, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:06:54.047414", + "step": 2295, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2269105762243271, + "timestamp": "2025-09-05 09:06:54.064076", + "step": 2296, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:06:54.308280", + "step": 2296, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2711460590362549, + "timestamp": "2025-09-05 09:06:54.310952", + "step": 2297, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:06:54.522080", + "step": 2297, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.32219910621643066, + "timestamp": "2025-09-05 09:06:54.523953", + "step": 2298, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:54.730976", + "step": 2298, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3553353548049927, + "timestamp": "2025-09-05 09:06:54.732882", + "step": 2299, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:06:54.899746", + "step": 2299, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3341793119907379, + "timestamp": "2025-09-05 09:06:54.915155", + "step": 2300, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:06:59.573196", + "step": 2300, + "epoch": 2 + }, + { + "type": "pplx", + "content": 53.83812948418028, + "timestamp": "2025-09-05 09:06:59.576460", + "step": 2300, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:06:59.738712", + "step": 2300, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.37382030487060547, + "timestamp": "2025-09-05 09:06:59.740310", + "step": 2301, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:06:59.907105", + "step": 2301, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3155753016471863, + "timestamp": "2025-09-05 09:06:59.908756", + "step": 2302, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:00.113462", + "step": 2302, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3371022045612335, + "timestamp": "2025-09-05 09:07:00.115231", + "step": 2303, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:00.310344", + "step": 2303, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3813401162624359, + "timestamp": "2025-09-05 09:07:00.324859", + "step": 2304, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:00.512247", + "step": 2304, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3163757920265198, + "timestamp": "2025-09-05 09:07:00.513868", + "step": 2305, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:00.709592", + "step": 2305, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4015030264854431, + "timestamp": "2025-09-05 09:07:00.713421", + "step": 2306, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:00.911204", + "step": 2306, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.241668239235878, + "timestamp": "2025-09-05 09:07:00.912870", + "step": 2307, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:01.112322", + "step": 2307, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3123008906841278, + "timestamp": "2025-09-05 09:07:01.122844", + "step": 2308, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:01.284697", + "step": 2308, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22155259549617767, + "timestamp": "2025-09-05 09:07:01.286352", + "step": 2309, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:01.490379", + "step": 2309, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2877575755119324, + "timestamp": "2025-09-05 09:07:01.492069", + "step": 2310, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:07:01.696638", + "step": 2310, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28378915786743164, + "timestamp": "2025-09-05 09:07:01.698262", + "step": 2311, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:01.904919", + "step": 2311, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.305711030960083, + "timestamp": "2025-09-05 09:07:01.914270", + "step": 2312, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:07:02.076917", + "step": 2312, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3109692633152008, + "timestamp": "2025-09-05 09:07:02.079113", + "step": 2313, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:02.283526", + "step": 2313, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24978035688400269, + "timestamp": "2025-09-05 09:07:02.285251", + "step": 2314, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:07:02.482566", + "step": 2314, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.35233667492866516, + "timestamp": "2025-09-05 09:07:02.484199", + "step": 2315, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:02.679086", + "step": 2315, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24674127995967865, + "timestamp": "2025-09-05 09:07:02.696266", + "step": 2316, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:02.892813", + "step": 2316, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3011695444583893, + "timestamp": "2025-09-05 09:07:02.894598", + "step": 2317, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:07:03.063154", + "step": 2317, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30131420493125916, + "timestamp": "2025-09-05 09:07:03.065608", + "step": 2318, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:03.271933", + "step": 2318, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3193568289279938, + "timestamp": "2025-09-05 09:07:03.273728", + "step": 2319, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:03.479048", + "step": 2319, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2760051488876343, + "timestamp": "2025-09-05 09:07:03.493537", + "step": 2320, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:07:08.176711", + "step": 2320, + "epoch": 2 + }, + { + "type": "pplx", + "content": 53.84229306421487, + "timestamp": "2025-09-05 09:07:08.178871", + "step": 2320, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2320", + "timestamp": "2025-09-05 09:07:08.636200", + "step": 2320, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:08.804432", + "step": 2320, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3730832636356354, + "timestamp": "2025-09-05 09:07:08.806721", + "step": 2321, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:09.011245", + "step": 2321, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.47483229637145996, + "timestamp": "2025-09-05 09:07:09.013084", + "step": 2322, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:09.218081", + "step": 2322, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26618441939353943, + "timestamp": "2025-09-05 09:07:09.219997", + "step": 2323, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:09.416051", + "step": 2323, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.262174516916275, + "timestamp": "2025-09-05 09:07:09.425547", + "step": 2324, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:09.587653", + "step": 2324, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.43635013699531555, + "timestamp": "2025-09-05 09:07:09.589717", + "step": 2325, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:09.796156", + "step": 2325, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2052556276321411, + "timestamp": "2025-09-05 09:07:09.797976", + "step": 2326, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:09.967138", + "step": 2326, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3712131083011627, + "timestamp": "2025-09-05 09:07:09.969415", + "step": 2327, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:07:10.164137", + "step": 2327, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.35729169845581055, + "timestamp": "2025-09-05 09:07:10.178407", + "step": 2328, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:10.368304", + "step": 2328, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.39892011880874634, + "timestamp": "2025-09-05 09:07:10.370520", + "step": 2329, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:10.567236", + "step": 2329, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36499282717704773, + "timestamp": "2025-09-05 09:07:10.569442", + "step": 2330, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:10.780649", + "step": 2330, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23433853685855865, + "timestamp": "2025-09-05 09:07:10.782762", + "step": 2331, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:10.979945", + "step": 2331, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24312803149223328, + "timestamp": "2025-09-05 09:07:10.994537", + "step": 2332, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:07:11.183633", + "step": 2332, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26388394832611084, + "timestamp": "2025-09-05 09:07:11.185595", + "step": 2333, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:11.393159", + "step": 2333, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4343104660511017, + "timestamp": "2025-09-05 09:07:11.395175", + "step": 2334, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:07:11.593175", + "step": 2334, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.39986181259155273, + "timestamp": "2025-09-05 09:07:11.594881", + "step": 2335, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:11.793473", + "step": 2335, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.48972994089126587, + "timestamp": "2025-09-05 09:07:11.802839", + "step": 2336, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:11.967517", + "step": 2336, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2836100459098816, + "timestamp": "2025-09-05 09:07:11.969198", + "step": 2337, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:07:12.174955", + "step": 2337, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2238231599330902, + "timestamp": "2025-09-05 09:07:12.177197", + "step": 2338, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:07:12.374452", + "step": 2338, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3275935649871826, + "timestamp": "2025-09-05 09:07:12.377033", + "step": 2339, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:12.573386", + "step": 2339, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31032243371009827, + "timestamp": "2025-09-05 09:07:12.587393", + "step": 2340, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:07:17.348659", + "step": 2340, + "epoch": 2 + }, + { + "type": "pplx", + "content": 54.3609793507131, + "timestamp": "2025-09-05 09:07:17.350682", + "step": 2340, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:17.511929", + "step": 2340, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.37032943964004517, + "timestamp": "2025-09-05 09:07:17.514323", + "step": 2341, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:17.721157", + "step": 2341, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3330005407333374, + "timestamp": "2025-09-05 09:07:17.723262", + "step": 2342, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:07:17.919750", + "step": 2342, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26726004481315613, + "timestamp": "2025-09-05 09:07:17.922084", + "step": 2343, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:18.129709", + "step": 2343, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2254151552915573, + "timestamp": "2025-09-05 09:07:18.139944", + "step": 2344, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:07:18.302073", + "step": 2344, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2709170877933502, + "timestamp": "2025-09-05 09:07:18.306175", + "step": 2345, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:07:18.520095", + "step": 2345, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36739596724510193, + "timestamp": "2025-09-05 09:07:18.526131", + "step": 2346, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:18.693682", + "step": 2346, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24793685972690582, + "timestamp": "2025-09-05 09:07:18.699367", + "step": 2347, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:18.908254", + "step": 2347, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25923240184783936, + "timestamp": "2025-09-05 09:07:18.917770", + "step": 2348, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:19.083149", + "step": 2348, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4075714349746704, + "timestamp": "2025-09-05 09:07:19.086080", + "step": 2349, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:19.253820", + "step": 2349, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22864657640457153, + "timestamp": "2025-09-05 09:07:19.269288", + "step": 2350, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:19.517188", + "step": 2350, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2772950828075409, + "timestamp": "2025-09-05 09:07:19.520982", + "step": 2351, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:07:19.693977", + "step": 2351, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.237502783536911, + "timestamp": "2025-09-05 09:07:19.710552", + "step": 2352, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:19.910056", + "step": 2352, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3322488069534302, + "timestamp": "2025-09-05 09:07:19.917125", + "step": 2353, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:07:20.116870", + "step": 2353, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.39339199662208557, + "timestamp": "2025-09-05 09:07:20.119561", + "step": 2354, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:20.327031", + "step": 2354, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31205451488494873, + "timestamp": "2025-09-05 09:07:20.331019", + "step": 2355, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:20.540792", + "step": 2355, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.19401071965694427, + "timestamp": "2025-09-05 09:07:20.550029", + "step": 2356, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:20.714882", + "step": 2356, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27951422333717346, + "timestamp": "2025-09-05 09:07:20.716627", + "step": 2357, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:07:20.924647", + "step": 2357, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3359452486038208, + "timestamp": "2025-09-05 09:07:20.926530", + "step": 2358, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:21.094918", + "step": 2358, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.32733088731765747, + "timestamp": "2025-09-05 09:07:21.096900", + "step": 2359, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:21.294512", + "step": 2359, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27296555042266846, + "timestamp": "2025-09-05 09:07:21.304367", + "step": 2360, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:07:26.011972", + "step": 2360, + "epoch": 2 + }, + { + "type": "pplx", + "content": 55.69164641376557, + "timestamp": "2025-09-05 09:07:26.014599", + "step": 2360, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2360", + "timestamp": "2025-09-05 09:07:26.471307", + "step": 2360, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:07:26.639440", + "step": 2360, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3209402859210968, + "timestamp": "2025-09-05 09:07:26.641714", + "step": 2361, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:26.845836", + "step": 2361, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2272927612066269, + "timestamp": "2025-09-05 09:07:26.848505", + "step": 2362, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:27.045204", + "step": 2362, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3490237295627594, + "timestamp": "2025-09-05 09:07:27.047647", + "step": 2363, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:07:27.253023", + "step": 2363, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3624802231788635, + "timestamp": "2025-09-05 09:07:27.267156", + "step": 2364, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:27.457605", + "step": 2364, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36499476432800293, + "timestamp": "2025-09-05 09:07:27.459374", + "step": 2365, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:27.655960", + "step": 2365, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28135761618614197, + "timestamp": "2025-09-05 09:07:27.658541", + "step": 2366, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:07:27.858711", + "step": 2366, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.345076322555542, + "timestamp": "2025-09-05 09:07:27.860970", + "step": 2367, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:07:28.059087", + "step": 2367, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.41919395327568054, + "timestamp": "2025-09-05 09:07:28.068579", + "step": 2368, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:07:28.231677", + "step": 2368, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2436763346195221, + "timestamp": "2025-09-05 09:07:28.233668", + "step": 2369, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:28.399911", + "step": 2369, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23662683367729187, + "timestamp": "2025-09-05 09:07:28.402152", + "step": 2370, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:07:28.597574", + "step": 2370, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22401820123195648, + "timestamp": "2025-09-05 09:07:28.599367", + "step": 2371, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:07:28.765558", + "step": 2371, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23337076604366302, + "timestamp": "2025-09-05 09:07:28.782887", + "step": 2372, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:07:28.977131", + "step": 2372, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3868841230869293, + "timestamp": "2025-09-05 09:07:28.979493", + "step": 2373, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:29.176816", + "step": 2373, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.47699031233787537, + "timestamp": "2025-09-05 09:07:29.184646", + "step": 2374, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:29.396413", + "step": 2374, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.437484472990036, + "timestamp": "2025-09-05 09:07:29.398070", + "step": 2375, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:07:29.603081", + "step": 2375, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22171063721179962, + "timestamp": "2025-09-05 09:07:29.611960", + "step": 2376, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:29.772947", + "step": 2376, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2754736840724945, + "timestamp": "2025-09-05 09:07:29.774820", + "step": 2377, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:07:29.976846", + "step": 2377, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2224128544330597, + "timestamp": "2025-09-05 09:07:29.978350", + "step": 2378, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:07:30.142774", + "step": 2378, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26598674058914185, + "timestamp": "2025-09-05 09:07:30.144652", + "step": 2379, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:30.338071", + "step": 2379, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31558096408843994, + "timestamp": "2025-09-05 09:07:30.347353", + "step": 2380, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:07:35.003062", + "step": 2380, + "epoch": 2 + }, + { + "type": "pplx", + "content": 54.91370123427139, + "timestamp": "2025-09-05 09:07:35.005288", + "step": 2380, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:35.166638", + "step": 2380, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3278944790363312, + "timestamp": "2025-09-05 09:07:35.168373", + "step": 2381, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:35.334452", + "step": 2381, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3350487947463989, + "timestamp": "2025-09-05 09:07:35.336389", + "step": 2382, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:35.502370", + "step": 2382, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2530112862586975, + "timestamp": "2025-09-05 09:07:35.504209", + "step": 2383, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:35.671250", + "step": 2383, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34709376096725464, + "timestamp": "2025-09-05 09:07:35.680926", + "step": 2384, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:35.845859", + "step": 2384, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2088186889886856, + "timestamp": "2025-09-05 09:07:35.847816", + "step": 2385, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:36.013801", + "step": 2385, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2869564890861511, + "timestamp": "2025-09-05 09:07:36.015947", + "step": 2386, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:36.181958", + "step": 2386, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3361554741859436, + "timestamp": "2025-09-05 09:07:36.184163", + "step": 2387, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:36.352174", + "step": 2387, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.35683155059814453, + "timestamp": "2025-09-05 09:07:36.361191", + "step": 2388, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:36.525010", + "step": 2388, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4390067756175995, + "timestamp": "2025-09-05 09:07:36.526684", + "step": 2389, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:07:36.691073", + "step": 2389, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3709789514541626, + "timestamp": "2025-09-05 09:07:36.693111", + "step": 2390, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:07:36.862890", + "step": 2390, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2603262960910797, + "timestamp": "2025-09-05 09:07:36.865311", + "step": 2391, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:37.031735", + "step": 2391, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23307450115680695, + "timestamp": "2025-09-05 09:07:37.041183", + "step": 2392, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:37.204920", + "step": 2392, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27466168999671936, + "timestamp": "2025-09-05 09:07:37.206775", + "step": 2393, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:07:37.373885", + "step": 2393, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2780590355396271, + "timestamp": "2025-09-05 09:07:37.375972", + "step": 2394, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:37.542987", + "step": 2394, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.17993779480457306, + "timestamp": "2025-09-05 09:07:37.545373", + "step": 2395, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:37.711127", + "step": 2395, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33172348141670227, + "timestamp": "2025-09-05 09:07:37.777082", + "step": 2396, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:07:37.982581", + "step": 2396, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23976249992847443, + "timestamp": "2025-09-05 09:07:37.985053", + "step": 2397, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:07:38.166172", + "step": 2397, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3238065540790558, + "timestamp": "2025-09-05 09:07:38.168100", + "step": 2398, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:07:38.327669", + "step": 2398, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2310824692249298, + "timestamp": "2025-09-05 09:07:38.329756", + "step": 2399, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:38.488943", + "step": 2399, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.13710328936576843, + "timestamp": "2025-09-05 09:07:38.503058", + "step": 2400, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:07:43.150193", + "step": 2400, + "epoch": 2 + }, + { + "type": "pplx", + "content": 53.65770720978935, + "timestamp": "2025-09-05 09:07:43.152234", + "step": 2400, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2400", + "timestamp": "2025-09-05 09:07:43.622167", + "step": 2400, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:43.756497", + "step": 2400, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3784136474132538, + "timestamp": "2025-09-05 09:07:43.758846", + "step": 2401, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:43.928455", + "step": 2401, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28840699791908264, + "timestamp": "2025-09-05 09:07:43.930366", + "step": 2402, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:44.100558", + "step": 2402, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.12435296177864075, + "timestamp": "2025-09-05 09:07:44.103005", + "step": 2403, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:44.339719", + "step": 2403, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.19681872427463531, + "timestamp": "2025-09-05 09:07:44.355064", + "step": 2404, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:44.509599", + "step": 2404, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29926079511642456, + "timestamp": "2025-09-05 09:07:44.511796", + "step": 2405, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:44.682182", + "step": 2405, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.39890509843826294, + "timestamp": "2025-09-05 09:07:44.684493", + "step": 2406, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:44.855392", + "step": 2406, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34845206141471863, + "timestamp": "2025-09-05 09:07:44.857439", + "step": 2407, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 272 + ], + "flops": 5440033091648.0 + }, + "timestamp": "2025-09-05 09:07:45.021834", + "step": 2407, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4749622046947479, + "timestamp": "2025-09-05 09:07:45.036535", + "step": 2408, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:45.289008", + "step": 2408, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.17726445198059082, + "timestamp": "2025-09-05 09:07:45.314028", + "step": 2409, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:45.497454", + "step": 2409, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3263298571109772, + "timestamp": "2025-09-05 09:07:45.514542", + "step": 2410, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:07:45.745066", + "step": 2410, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2775588929653168, + "timestamp": "2025-09-05 09:07:45.747377", + "step": 2411, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:07:45.919583", + "step": 2411, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3065100610256195, + "timestamp": "2025-09-05 09:07:45.975180", + "step": 2412, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:46.147292", + "step": 2412, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28606003522872925, + "timestamp": "2025-09-05 09:07:46.149413", + "step": 2413, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:46.320168", + "step": 2413, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2223246544599533, + "timestamp": "2025-09-05 09:07:46.322268", + "step": 2414, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:46.481163", + "step": 2414, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4483068585395813, + "timestamp": "2025-09-05 09:07:46.483191", + "step": 2415, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:46.678206", + "step": 2415, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31489551067352295, + "timestamp": "2025-09-05 09:07:46.692575", + "step": 2416, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:07:46.844050", + "step": 2416, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2865988612174988, + "timestamp": "2025-09-05 09:07:46.846269", + "step": 2417, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:47.012544", + "step": 2417, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3174918293952942, + "timestamp": "2025-09-05 09:07:47.014872", + "step": 2418, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:47.186121", + "step": 2418, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26380813121795654, + "timestamp": "2025-09-05 09:07:47.188686", + "step": 2419, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:47.360860", + "step": 2419, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25339600443840027, + "timestamp": "2025-09-05 09:07:47.375096", + "step": 2420, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:07:52.646084", + "step": 2420, + "epoch": 2 + }, + { + "type": "pplx", + "content": 54.45162833560751, + "timestamp": "2025-09-05 09:07:52.648300", + "step": 2420, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:07:52.779251", + "step": 2420, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.21093280613422394, + "timestamp": "2025-09-05 09:07:52.781980", + "step": 2421, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:52.938785", + "step": 2421, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24759776890277863, + "timestamp": "2025-09-05 09:07:53.020176", + "step": 2422, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:07:53.213342", + "step": 2422, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.19092871248722076, + "timestamp": "2025-09-05 09:07:53.215194", + "step": 2423, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:07:53.382864", + "step": 2423, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38986799120903015, + "timestamp": "2025-09-05 09:07:53.396260", + "step": 2424, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:53.548122", + "step": 2424, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25403735041618347, + "timestamp": "2025-09-05 09:07:53.550389", + "step": 2425, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:53.707715", + "step": 2425, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.39463475346565247, + "timestamp": "2025-09-05 09:07:53.736535", + "step": 2426, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:07:53.949730", + "step": 2426, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27436649799346924, + "timestamp": "2025-09-05 09:07:53.982038", + "step": 2427, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:54.210833", + "step": 2427, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3307807147502899, + "timestamp": "2025-09-05 09:07:54.225106", + "step": 2428, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:54.380225", + "step": 2428, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.5084760785102844, + "timestamp": "2025-09-05 09:07:54.382463", + "step": 2429, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:54.552231", + "step": 2429, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2736986577510834, + "timestamp": "2025-09-05 09:07:54.554064", + "step": 2430, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:07:54.725829", + "step": 2430, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2925279140472412, + "timestamp": "2025-09-05 09:07:54.728433", + "step": 2431, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:07:54.886581", + "step": 2431, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27017101645469666, + "timestamp": "2025-09-05 09:07:54.903369", + "step": 2432, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:55.063357", + "step": 2432, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2796265780925751, + "timestamp": "2025-09-05 09:07:55.066444", + "step": 2433, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:55.224425", + "step": 2433, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.35782158374786377, + "timestamp": "2025-09-05 09:07:55.227028", + "step": 2434, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:07:55.364722", + "step": 2434, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3305521607398987, + "timestamp": "2025-09-05 09:07:55.366733", + "step": 2435, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:55.538296", + "step": 2435, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3081381618976593, + "timestamp": "2025-09-05 09:07:55.555184", + "step": 2436, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:55.716234", + "step": 2436, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3570384681224823, + "timestamp": "2025-09-05 09:07:55.718614", + "step": 2437, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:07:55.888053", + "step": 2437, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.15754249691963196, + "timestamp": "2025-09-05 09:07:55.890184", + "step": 2438, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:07:56.056501", + "step": 2438, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.20244644582271576, + "timestamp": "2025-09-05 09:07:56.058763", + "step": 2439, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:07:56.216598", + "step": 2439, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2593131959438324, + "timestamp": "2025-09-05 09:07:56.230745", + "step": 2440, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:08:01.348684", + "step": 2440, + "epoch": 2 + }, + { + "type": "pplx", + "content": 54.726233382312806, + "timestamp": "2025-09-05 09:08:01.352558", + "step": 2440, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2440", + "timestamp": "2025-09-05 09:08:01.916075", + "step": 2440, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:08:02.075987", + "step": 2440, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22823497653007507, + "timestamp": "2025-09-05 09:08:02.078058", + "step": 2441, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:02.251582", + "step": 2441, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.13066086173057556, + "timestamp": "2025-09-05 09:08:02.254006", + "step": 2442, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:08:02.418800", + "step": 2442, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3410320580005646, + "timestamp": "2025-09-05 09:08:02.420955", + "step": 2443, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:08:02.586353", + "step": 2443, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3090682327747345, + "timestamp": "2025-09-05 09:08:02.600282", + "step": 2444, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:02.757752", + "step": 2444, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3452575206756592, + "timestamp": "2025-09-05 09:08:02.760185", + "step": 2445, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:02.927518", + "step": 2445, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31340721249580383, + "timestamp": "2025-09-05 09:08:02.929857", + "step": 2446, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:03.190905", + "step": 2446, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3519221544265747, + "timestamp": "2025-09-05 09:08:03.192880", + "step": 2447, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:08:03.367264", + "step": 2447, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.5222123265266418, + "timestamp": "2025-09-05 09:08:03.381834", + "step": 2448, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:03.540496", + "step": 2448, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25161540508270264, + "timestamp": "2025-09-05 09:08:03.542795", + "step": 2449, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:03.709099", + "step": 2449, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28207287192344666, + "timestamp": "2025-09-05 09:08:03.752015", + "step": 2450, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:04.036556", + "step": 2450, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4279467761516571, + "timestamp": "2025-09-05 09:08:04.039030", + "step": 2451, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:04.354549", + "step": 2451, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.32319822907447815, + "timestamp": "2025-09-05 09:08:04.369029", + "step": 2452, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:04.538604", + "step": 2452, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27443209290504456, + "timestamp": "2025-09-05 09:08:04.598961", + "step": 2453, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:04.818752", + "step": 2453, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2456836998462677, + "timestamp": "2025-09-05 09:08:04.821148", + "step": 2454, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:04.988344", + "step": 2454, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.35845696926116943, + "timestamp": "2025-09-05 09:08:04.990202", + "step": 2455, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:05.154815", + "step": 2455, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.40192878246307373, + "timestamp": "2025-09-05 09:08:05.171087", + "step": 2456, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:08:05.339221", + "step": 2456, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.38854965567588806, + "timestamp": "2025-09-05 09:08:05.342153", + "step": 2457, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:05.506686", + "step": 2457, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36593395471572876, + "timestamp": "2025-09-05 09:08:05.508864", + "step": 2458, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:05.673878", + "step": 2458, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3558920621871948, + "timestamp": "2025-09-05 09:08:05.676132", + "step": 2459, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:08:05.841507", + "step": 2459, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23418471217155457, + "timestamp": "2025-09-05 09:08:05.858314", + "step": 2460, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:08:10.996179", + "step": 2460, + "epoch": 2 + }, + { + "type": "pplx", + "content": 53.48732085430296, + "timestamp": "2025-09-05 09:08:11.039993", + "step": 2460, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:11.173071", + "step": 2460, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2769435942173004, + "timestamp": "2025-09-05 09:08:11.189561", + "step": 2461, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:08:11.452212", + "step": 2461, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.44493263959884644, + "timestamp": "2025-09-05 09:08:11.454879", + "step": 2462, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:11.710370", + "step": 2462, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.274783194065094, + "timestamp": "2025-09-05 09:08:11.712867", + "step": 2463, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:11.932413", + "step": 2463, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2961232364177704, + "timestamp": "2025-09-05 09:08:11.945952", + "step": 2464, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:12.106328", + "step": 2464, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.19862420856952667, + "timestamp": "2025-09-05 09:08:12.108278", + "step": 2465, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:08:12.286236", + "step": 2465, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.49661245942115784, + "timestamp": "2025-09-05 09:08:12.288268", + "step": 2466, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:08:12.423310", + "step": 2466, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3499586284160614, + "timestamp": "2025-09-05 09:08:12.425494", + "step": 2467, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:12.590694", + "step": 2467, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3866555392742157, + "timestamp": "2025-09-05 09:08:12.604767", + "step": 2468, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:12.762857", + "step": 2468, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.15350095927715302, + "timestamp": "2025-09-05 09:08:12.765656", + "step": 2469, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:12.930539", + "step": 2469, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2772982716560364, + "timestamp": "2025-09-05 09:08:12.933067", + "step": 2470, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:13.150777", + "step": 2470, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2272377461194992, + "timestamp": "2025-09-05 09:08:13.153293", + "step": 2471, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:08:13.323513", + "step": 2471, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3104439973831177, + "timestamp": "2025-09-05 09:08:13.339831", + "step": 2472, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:13.506609", + "step": 2472, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3005099892616272, + "timestamp": "2025-09-05 09:08:13.509163", + "step": 2473, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:13.728381", + "step": 2473, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.32455500960350037, + "timestamp": "2025-09-05 09:08:13.730644", + "step": 2474, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:13.971396", + "step": 2474, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3721053898334503, + "timestamp": "2025-09-05 09:08:13.995417", + "step": 2475, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:14.176746", + "step": 2475, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33817243576049805, + "timestamp": "2025-09-05 09:08:14.194035", + "step": 2476, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:14.364770", + "step": 2476, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4381646513938904, + "timestamp": "2025-09-05 09:08:14.367767", + "step": 2477, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:08:14.586996", + "step": 2477, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.45706042647361755, + "timestamp": "2025-09-05 09:08:14.589275", + "step": 2478, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:14.811775", + "step": 2478, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3688715994358063, + "timestamp": "2025-09-05 09:08:14.813672", + "step": 2479, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:15.011990", + "step": 2479, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.365582138299942, + "timestamp": "2025-09-05 09:08:15.025519", + "step": 2480, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:08:20.267144", + "step": 2480, + "epoch": 2 + }, + { + "type": "pplx", + "content": 52.49038722586114, + "timestamp": "2025-09-05 09:08:20.269130", + "step": 2480, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2480", + "timestamp": "2025-09-05 09:08:20.748386", + "step": 2480, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:08:20.908304", + "step": 2480, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3301137685775757, + "timestamp": "2025-09-05 09:08:20.911345", + "step": 2481, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:21.048283", + "step": 2481, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2704985737800598, + "timestamp": "2025-09-05 09:08:21.051353", + "step": 2482, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:08:21.272855", + "step": 2482, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24071909487247467, + "timestamp": "2025-09-05 09:08:21.316201", + "step": 2483, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:08:21.495818", + "step": 2483, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27359169721603394, + "timestamp": "2025-09-05 09:08:21.585302", + "step": 2484, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:21.776756", + "step": 2484, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30878910422325134, + "timestamp": "2025-09-05 09:08:21.779385", + "step": 2485, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:08:21.944264", + "step": 2485, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3247639834880829, + "timestamp": "2025-09-05 09:08:21.986129", + "step": 2486, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:22.212722", + "step": 2486, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2619643211364746, + "timestamp": "2025-09-05 09:08:22.233914", + "step": 2487, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:22.458476", + "step": 2487, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26061657071113586, + "timestamp": "2025-09-05 09:08:22.472954", + "step": 2488, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:08:22.632179", + "step": 2488, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3099912106990814, + "timestamp": "2025-09-05 09:08:22.635510", + "step": 2489, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:22.799538", + "step": 2489, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.21963852643966675, + "timestamp": "2025-09-05 09:08:22.802163", + "step": 2490, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:22.978088", + "step": 2490, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3000223636627197, + "timestamp": "2025-09-05 09:08:22.981287", + "step": 2491, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:23.157663", + "step": 2491, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3209587335586548, + "timestamp": "2025-09-05 09:08:23.174124", + "step": 2492, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:23.342854", + "step": 2492, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33791443705558777, + "timestamp": "2025-09-05 09:08:23.346174", + "step": 2493, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:23.510842", + "step": 2493, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.5459884405136108, + "timestamp": "2025-09-05 09:08:23.513096", + "step": 2494, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:08:23.679341", + "step": 2494, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2700771987438202, + "timestamp": "2025-09-05 09:08:23.681816", + "step": 2495, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:08:23.847783", + "step": 2495, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30430659651756287, + "timestamp": "2025-09-05 09:08:23.858129", + "step": 2496, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:23.992946", + "step": 2496, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26771894097328186, + "timestamp": "2025-09-05 09:08:23.995222", + "step": 2497, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:24.257840", + "step": 2497, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.32667091488838196, + "timestamp": "2025-09-05 09:08:24.300330", + "step": 2498, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:08:24.522074", + "step": 2498, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.29449260234832764, + "timestamp": "2025-09-05 09:08:24.524635", + "step": 2499, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:24.689395", + "step": 2499, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2695910930633545, + "timestamp": "2025-09-05 09:08:24.703400", + "step": 2500, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:08:30.131163", + "step": 2500, + "epoch": 2 + }, + { + "type": "pplx", + "content": 52.813743962987445, + "timestamp": "2025-09-05 09:08:30.133472", + "step": 2500, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:30.266354", + "step": 2500, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33899521827697754, + "timestamp": "2025-09-05 09:08:30.268500", + "step": 2501, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:30.406032", + "step": 2501, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4425645172595978, + "timestamp": "2025-09-05 09:08:30.409138", + "step": 2502, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:30.730139", + "step": 2502, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3839690387248993, + "timestamp": "2025-09-05 09:08:30.732271", + "step": 2503, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:30.944016", + "step": 2503, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.28737911581993103, + "timestamp": "2025-09-05 09:08:30.958478", + "step": 2504, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:31.147199", + "step": 2504, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.5052091479301453, + "timestamp": "2025-09-05 09:08:31.171139", + "step": 2505, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:31.513127", + "step": 2505, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34135702252388, + "timestamp": "2025-09-05 09:08:31.515377", + "step": 2506, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:31.720097", + "step": 2506, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22179536521434784, + "timestamp": "2025-09-05 09:08:31.722369", + "step": 2507, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:31.890591", + "step": 2507, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3495122790336609, + "timestamp": "2025-09-05 09:08:31.907828", + "step": 2508, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:32.101530", + "step": 2508, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.252922922372818, + "timestamp": "2025-09-05 09:08:32.105574", + "step": 2509, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:08:32.310055", + "step": 2509, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2928535044193268, + "timestamp": "2025-09-05 09:08:32.312376", + "step": 2510, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:32.561305", + "step": 2510, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3097461462020874, + "timestamp": "2025-09-05 09:08:32.564710", + "step": 2511, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:08:32.772402", + "step": 2511, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3948187232017517, + "timestamp": "2025-09-05 09:08:32.789068", + "step": 2512, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:08:33.031818", + "step": 2512, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2657097578048706, + "timestamp": "2025-09-05 09:08:33.034507", + "step": 2513, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:08:33.241556", + "step": 2513, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3475334346294403, + "timestamp": "2025-09-05 09:08:33.244226", + "step": 2514, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:08:33.451829", + "step": 2514, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2843340039253235, + "timestamp": "2025-09-05 09:08:33.453852", + "step": 2515, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:33.658405", + "step": 2515, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4441480338573456, + "timestamp": "2025-09-05 09:08:33.673480", + "step": 2516, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:08:33.862983", + "step": 2516, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24662137031555176, + "timestamp": "2025-09-05 09:08:33.865600", + "step": 2517, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:34.080449", + "step": 2517, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.40211036801338196, + "timestamp": "2025-09-05 09:08:34.082800", + "step": 2518, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:34.371690", + "step": 2518, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4713238477706909, + "timestamp": "2025-09-05 09:08:34.374322", + "step": 2519, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:08:34.571969", + "step": 2519, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3440394401550293, + "timestamp": "2025-09-05 09:08:34.587486", + "step": 2520, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:08:40.157649", + "step": 2520, + "epoch": 2 + }, + { + "type": "pplx", + "content": 53.0074163999356, + "timestamp": "2025-09-05 09:08:40.159389", + "step": 2520, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2520", + "timestamp": "2025-09-05 09:08:40.670948", + "step": 2520, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:40.870996", + "step": 2520, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2662496566772461, + "timestamp": "2025-09-05 09:08:40.873982", + "step": 2521, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:41.068057", + "step": 2521, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2788788974285126, + "timestamp": "2025-09-05 09:08:41.070930", + "step": 2522, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:41.236468", + "step": 2522, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2379118651151657, + "timestamp": "2025-09-05 09:08:41.240108", + "step": 2523, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:41.445747", + "step": 2523, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2710990309715271, + "timestamp": "2025-09-05 09:08:41.460173", + "step": 2524, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:41.647392", + "step": 2524, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.35253018140792847, + "timestamp": "2025-09-05 09:08:41.649673", + "step": 2525, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:41.899382", + "step": 2525, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.33853983879089355, + "timestamp": "2025-09-05 09:08:41.903036", + "step": 2526, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:08:42.152763", + "step": 2526, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2997421324253082, + "timestamp": "2025-09-05 09:08:42.195973", + "step": 2527, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:08:42.443160", + "step": 2527, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22982150316238403, + "timestamp": "2025-09-05 09:08:42.460245", + "step": 2528, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:42.657316", + "step": 2528, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27433791756629944, + "timestamp": "2025-09-05 09:08:42.661098", + "step": 2529, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:42.859379", + "step": 2529, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4138650894165039, + "timestamp": "2025-09-05 09:08:42.861980", + "step": 2530, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:43.057651", + "step": 2530, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.5147197842597961, + "timestamp": "2025-09-05 09:08:43.061658", + "step": 2531, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:08:43.258384", + "step": 2531, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3644541800022125, + "timestamp": "2025-09-05 09:08:43.277457", + "step": 2532, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:43.595788", + "step": 2532, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.21938161551952362, + "timestamp": "2025-09-05 09:08:43.598479", + "step": 2533, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:08:43.796759", + "step": 2533, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3870631754398346, + "timestamp": "2025-09-05 09:08:43.799743", + "step": 2534, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:43.999027", + "step": 2534, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.1749730259180069, + "timestamp": "2025-09-05 09:08:44.002043", + "step": 2535, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:44.198000", + "step": 2535, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2818840742111206, + "timestamp": "2025-09-05 09:08:44.212763", + "step": 2536, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:08:44.400117", + "step": 2536, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22080212831497192, + "timestamp": "2025-09-05 09:08:44.403454", + "step": 2537, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:44.610140", + "step": 2537, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2120576649904251, + "timestamp": "2025-09-05 09:08:44.613452", + "step": 2538, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:44.862843", + "step": 2538, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.17697148025035858, + "timestamp": "2025-09-05 09:08:44.864931", + "step": 2539, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:45.040233", + "step": 2539, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.37181296944618225, + "timestamp": "2025-09-05 09:08:45.055138", + "step": 2540, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:08:50.212502", + "step": 2540, + "epoch": 2 + }, + { + "type": "pplx", + "content": 53.064035598436966, + "timestamp": "2025-09-05 09:08:50.214408", + "step": 2540, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:50.346568", + "step": 2540, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3222182095050812, + "timestamp": "2025-09-05 09:08:50.348433", + "step": 2541, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:50.484589", + "step": 2541, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.15369223058223724, + "timestamp": "2025-09-05 09:08:50.486899", + "step": 2542, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:50.622707", + "step": 2542, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.37195268273353577, + "timestamp": "2025-09-05 09:08:50.624641", + "step": 2543, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:08:50.761152", + "step": 2543, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3791513741016388, + "timestamp": "2025-09-05 09:08:50.769844", + "step": 2544, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:08:50.902437", + "step": 2544, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25493812561035156, + "timestamp": "2025-09-05 09:08:50.904432", + "step": 2545, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:51.062401", + "step": 2545, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34883612394332886, + "timestamp": "2025-09-05 09:08:51.064212", + "step": 2546, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:08:51.235180", + "step": 2546, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36856377124786377, + "timestamp": "2025-09-05 09:08:51.237348", + "step": 2547, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:51.372850", + "step": 2547, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.27879372239112854, + "timestamp": "2025-09-05 09:08:51.386856", + "step": 2548, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:08:51.538291", + "step": 2548, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26352420449256897, + "timestamp": "2025-09-05 09:08:51.539972", + "step": 2549, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:08:51.709065", + "step": 2549, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3046532869338989, + "timestamp": "2025-09-05 09:08:51.711027", + "step": 2550, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:08:51.872075", + "step": 2550, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.45060494542121887, + "timestamp": "2025-09-05 09:08:51.876519", + "step": 2551, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:52.036405", + "step": 2551, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4088442027568817, + "timestamp": "2025-09-05 09:08:52.050278", + "step": 2552, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:52.201867", + "step": 2552, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2506445646286011, + "timestamp": "2025-09-05 09:08:52.203855", + "step": 2553, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:52.361528", + "step": 2553, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2608177363872528, + "timestamp": "2025-09-05 09:08:52.363491", + "step": 2554, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:52.521849", + "step": 2554, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.35635024309158325, + "timestamp": "2025-09-05 09:08:52.523750", + "step": 2555, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:08:52.681338", + "step": 2555, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3468969166278839, + "timestamp": "2025-09-05 09:08:52.696825", + "step": 2556, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:52.855565", + "step": 2556, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.421159029006958, + "timestamp": "2025-09-05 09:08:52.857810", + "step": 2557, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:08:53.018521", + "step": 2557, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3261510729789734, + "timestamp": "2025-09-05 09:08:53.020341", + "step": 2558, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:08:53.190276", + "step": 2558, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4048421084880829, + "timestamp": "2025-09-05 09:08:53.192324", + "step": 2559, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:08:53.362579", + "step": 2559, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.41077789664268494, + "timestamp": "2025-09-05 09:08:53.378509", + "step": 2560, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:08:58.019506", + "step": 2560, + "epoch": 2 + }, + { + "type": "pplx", + "content": 54.037454830465165, + "timestamp": "2025-09-05 09:08:58.021631", + "step": 2560, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2560", + "timestamp": "2025-09-05 09:08:58.543085", + "step": 2560, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:08:58.681373", + "step": 2560, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25367656350135803, + "timestamp": "2025-09-05 09:08:58.683466", + "step": 2561, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:58.839482", + "step": 2561, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2890976667404175, + "timestamp": "2025-09-05 09:08:58.841709", + "step": 2562, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:58.999471", + "step": 2562, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3569534718990326, + "timestamp": "2025-09-05 09:08:59.001771", + "step": 2563, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:08:59.170317", + "step": 2563, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.26353394985198975, + "timestamp": "2025-09-05 09:08:59.183922", + "step": 2564, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:59.337378", + "step": 2564, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.278327077627182, + "timestamp": "2025-09-05 09:08:59.339152", + "step": 2565, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:08:59.506890", + "step": 2565, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.4738348126411438, + "timestamp": "2025-09-05 09:08:59.508929", + "step": 2566, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:08:59.666733", + "step": 2566, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.22108004987239838, + "timestamp": "2025-09-05 09:08:59.668438", + "step": 2567, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:59.824336", + "step": 2567, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3641687333583832, + "timestamp": "2025-09-05 09:08:59.838720", + "step": 2568, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:08:59.997932", + "step": 2568, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2895580232143402, + "timestamp": "2025-09-05 09:09:00.000313", + "step": 2569, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:00.135261", + "step": 2569, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3097214698791504, + "timestamp": "2025-09-05 09:09:00.137099", + "step": 2570, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:00.304405", + "step": 2570, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3036538064479828, + "timestamp": "2025-09-05 09:09:00.306729", + "step": 2571, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:09:00.463817", + "step": 2571, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3194674849510193, + "timestamp": "2025-09-05 09:09:00.477410", + "step": 2572, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:00.628501", + "step": 2572, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3619299829006195, + "timestamp": "2025-09-05 09:09:00.630635", + "step": 2573, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:00.790157", + "step": 2573, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2202121913433075, + "timestamp": "2025-09-05 09:09:00.792091", + "step": 2574, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:00.948683", + "step": 2574, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2178782969713211, + "timestamp": "2025-09-05 09:09:00.950541", + "step": 2575, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:01.121052", + "step": 2575, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.30588048696517944, + "timestamp": "2025-09-05 09:09:01.135032", + "step": 2576, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 240 + ], + "flops": 4800029206464.0 + }, + "timestamp": "2025-09-05 09:09:01.288635", + "step": 2576, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.252217561006546, + "timestamp": "2025-09-05 09:09:01.290940", + "step": 2577, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:09:01.460305", + "step": 2577, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3836561143398285, + "timestamp": "2025-09-05 09:09:01.462184", + "step": 2578, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:01.632772", + "step": 2578, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.14467762410640717, + "timestamp": "2025-09-05 09:09:01.635088", + "step": 2579, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:01.772020", + "step": 2579, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.36755555868148804, + "timestamp": "2025-09-05 09:09:01.788192", + "step": 2580, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:09:06.435373", + "step": 2580, + "epoch": 2 + }, + { + "type": "pplx", + "content": 54.81730763687925, + "timestamp": "2025-09-05 09:09:06.438007", + "step": 2580, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:06.570269", + "step": 2580, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3310108780860901, + "timestamp": "2025-09-05 09:09:06.572346", + "step": 2581, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:09:06.736690", + "step": 2581, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3351535499095917, + "timestamp": "2025-09-05 09:09:06.738837", + "step": 2582, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:06.902060", + "step": 2582, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3105543851852417, + "timestamp": "2025-09-05 09:09:06.904103", + "step": 2583, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:09:07.068138", + "step": 2583, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3798165023326874, + "timestamp": "2025-09-05 09:09:07.081927", + "step": 2584, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:07.239587", + "step": 2584, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2240927517414093, + "timestamp": "2025-09-05 09:09:07.241587", + "step": 2585, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:09:07.446448", + "step": 2585, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.44238588213920593, + "timestamp": "2025-09-05 09:09:07.448922", + "step": 2586, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:07.617891", + "step": 2586, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3627106845378876, + "timestamp": "2025-09-05 09:09:07.620036", + "step": 2587, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 240 + ], + "flops": 4800029206464.0 + }, + "timestamp": "2025-09-05 09:09:07.788432", + "step": 2587, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.41372790932655334, + "timestamp": "2025-09-05 09:09:07.797290", + "step": 2588, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:07.933122", + "step": 2588, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.1950422078371048, + "timestamp": "2025-09-05 09:09:07.935093", + "step": 2589, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:08.070991", + "step": 2589, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.23546253144741058, + "timestamp": "2025-09-05 09:09:08.073472", + "step": 2590, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:08.251746", + "step": 2590, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.24141792953014374, + "timestamp": "2025-09-05 09:09:08.254326", + "step": 2591, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:08.425452", + "step": 2591, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3847906291484833, + "timestamp": "2025-09-05 09:09:08.439722", + "step": 2592, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:08.634658", + "step": 2592, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.34318065643310547, + "timestamp": "2025-09-05 09:09:08.637294", + "step": 2593, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:08.822224", + "step": 2593, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.3513537645339966, + "timestamp": "2025-09-05 09:09:08.824692", + "step": 2594, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:09.010563", + "step": 2594, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2467866986989975, + "timestamp": "2025-09-05 09:09:09.014666", + "step": 2595, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:09:09.204328", + "step": 2595, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.31491708755493164, + "timestamp": "2025-09-05 09:09:09.221207", + "step": 2596, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:09.387748", + "step": 2596, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.42033904790878296, + "timestamp": "2025-09-05 09:09:09.390084", + "step": 2597, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:09:09.556120", + "step": 2597, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.25425952672958374, + "timestamp": "2025-09-05 09:09:09.558324", + "step": 2598, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:09.721976", + "step": 2598, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2386419177055359, + "timestamp": "2025-09-05 09:09:09.724701", + "step": 2599, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:09:09.888119", + "step": 2599, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.1672760248184204, + "timestamp": "2025-09-05 09:09:09.902009", + "step": 2600, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:09:14.531558", + "step": 2600, + "epoch": 2 + }, + { + "type": "pplx", + "content": 54.832467462479364, + "timestamp": "2025-09-05 09:09:14.533780", + "step": 2600, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2600", + "timestamp": "2025-09-05 09:09:15.004154", + "step": 2600, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:15.143984", + "step": 2600, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2909306585788727, + "timestamp": "2025-09-05 09:09:15.145976", + "step": 2601, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:09:15.310389", + "step": 2601, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.369361937046051, + "timestamp": "2025-09-05 09:09:15.312190", + "step": 2602, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:09:15.475409", + "step": 2602, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2690865099430084, + "timestamp": "2025-09-05 09:09:15.477319", + "step": 2603, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:15.649941", + "step": 2603, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.41636621952056885, + "timestamp": "2025-09-05 09:09:15.664165", + "step": 2604, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:15.829025", + "step": 2604, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24433229863643646, + "timestamp": "2025-09-05 09:09:15.830945", + "step": 2605, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:15.996184", + "step": 2605, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3325082063674927, + "timestamp": "2025-09-05 09:09:15.998155", + "step": 2606, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:16.161457", + "step": 2606, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2946546971797943, + "timestamp": "2025-09-05 09:09:16.163593", + "step": 2607, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:16.327289", + "step": 2607, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.46848660707473755, + "timestamp": "2025-09-05 09:09:16.341734", + "step": 2608, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:16.498726", + "step": 2608, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.39881542325019836, + "timestamp": "2025-09-05 09:09:16.502038", + "step": 2609, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:16.669668", + "step": 2609, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3879721462726593, + "timestamp": "2025-09-05 09:09:16.671710", + "step": 2610, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:16.844414", + "step": 2610, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3810281455516815, + "timestamp": "2025-09-05 09:09:16.846276", + "step": 2611, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:17.020193", + "step": 2611, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.41419529914855957, + "timestamp": "2025-09-05 09:09:17.034287", + "step": 2612, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:17.192268", + "step": 2612, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.296019047498703, + "timestamp": "2025-09-05 09:09:17.195977", + "step": 2613, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:17.359277", + "step": 2613, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.39310163259506226, + "timestamp": "2025-09-05 09:09:17.361128", + "step": 2614, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:17.519311", + "step": 2614, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2674983739852905, + "timestamp": "2025-09-05 09:09:17.521442", + "step": 2615, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:09:17.678767", + "step": 2615, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34225496649742126, + "timestamp": "2025-09-05 09:09:17.692054", + "step": 2616, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:17.843075", + "step": 2616, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.36751917004585266, + "timestamp": "2025-09-05 09:09:17.845103", + "step": 2617, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:18.001589", + "step": 2617, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28645431995391846, + "timestamp": "2025-09-05 09:09:18.003740", + "step": 2618, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:09:18.163572", + "step": 2618, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27839794754981995, + "timestamp": "2025-09-05 09:09:18.165439", + "step": 2619, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:18.322425", + "step": 2619, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28111451864242554, + "timestamp": "2025-09-05 09:09:18.336168", + "step": 2620, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:09:22.963303", + "step": 2620, + "epoch": 3 + }, + { + "type": "pplx", + "content": 54.74175043223115, + "timestamp": "2025-09-05 09:09:22.965443", + "step": 2620, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:23.097523", + "step": 2620, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.41243675351142883, + "timestamp": "2025-09-05 09:09:23.099641", + "step": 2621, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:09:23.236457", + "step": 2621, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4278426766395569, + "timestamp": "2025-09-05 09:09:23.238298", + "step": 2622, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:23.374088", + "step": 2622, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.25342079997062683, + "timestamp": "2025-09-05 09:09:23.376267", + "step": 2623, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:09:23.513612", + "step": 2623, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4077974259853363, + "timestamp": "2025-09-05 09:09:23.522957", + "step": 2624, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:23.657452", + "step": 2624, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2912677526473999, + "timestamp": "2025-09-05 09:09:23.659379", + "step": 2625, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:23.794514", + "step": 2625, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28832995891571045, + "timestamp": "2025-09-05 09:09:23.796204", + "step": 2626, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:23.931743", + "step": 2626, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3055991530418396, + "timestamp": "2025-09-05 09:09:23.934138", + "step": 2627, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:24.103487", + "step": 2627, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3616314232349396, + "timestamp": "2025-09-05 09:09:24.118124", + "step": 2628, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:24.272435", + "step": 2628, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29930540919303894, + "timestamp": "2025-09-05 09:09:24.274499", + "step": 2629, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:24.432410", + "step": 2629, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27770620584487915, + "timestamp": "2025-09-05 09:09:24.434428", + "step": 2630, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:09:24.603178", + "step": 2630, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4240753948688507, + "timestamp": "2025-09-05 09:09:24.604981", + "step": 2631, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:09:24.739831", + "step": 2631, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2943328619003296, + "timestamp": "2025-09-05 09:09:24.755715", + "step": 2632, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:24.915341", + "step": 2632, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28002142906188965, + "timestamp": "2025-09-05 09:09:24.917749", + "step": 2633, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:25.074494", + "step": 2633, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3681841194629669, + "timestamp": "2025-09-05 09:09:25.076991", + "step": 2634, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:25.242111", + "step": 2634, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3582687973976135, + "timestamp": "2025-09-05 09:09:25.244392", + "step": 2635, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:25.403267", + "step": 2635, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.38165122270584106, + "timestamp": "2025-09-05 09:09:25.417274", + "step": 2636, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:25.569052", + "step": 2636, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3990744352340698, + "timestamp": "2025-09-05 09:09:25.571190", + "step": 2637, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:25.733478", + "step": 2637, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.33937230706214905, + "timestamp": "2025-09-05 09:09:25.735736", + "step": 2638, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:25.911457", + "step": 2638, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3162641227245331, + "timestamp": "2025-09-05 09:09:25.913737", + "step": 2639, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:09:26.072299", + "step": 2639, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2395915687084198, + "timestamp": "2025-09-05 09:09:26.088449", + "step": 2640, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:09:30.715372", + "step": 2640, + "epoch": 3 + }, + { + "type": "pplx", + "content": 54.28784188000765, + "timestamp": "2025-09-05 09:09:30.717835", + "step": 2640, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2640", + "timestamp": "2025-09-05 09:09:31.194816", + "step": 2640, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:31.341771", + "step": 2640, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3118651211261749, + "timestamp": "2025-09-05 09:09:31.344881", + "step": 2641, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:31.503551", + "step": 2641, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.33012741804122925, + "timestamp": "2025-09-05 09:09:31.505717", + "step": 2642, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:31.662794", + "step": 2642, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.376676470041275, + "timestamp": "2025-09-05 09:09:31.665311", + "step": 2643, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:31.833512", + "step": 2643, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30343472957611084, + "timestamp": "2025-09-05 09:09:31.847642", + "step": 2644, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:32.002431", + "step": 2644, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3581269681453705, + "timestamp": "2025-09-05 09:09:32.004501", + "step": 2645, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:32.173097", + "step": 2645, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22369900345802307, + "timestamp": "2025-09-05 09:09:32.174951", + "step": 2646, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:32.309856", + "step": 2646, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3513410985469818, + "timestamp": "2025-09-05 09:09:32.312072", + "step": 2647, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:32.481357", + "step": 2647, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.44361498951911926, + "timestamp": "2025-09-05 09:09:32.495464", + "step": 2648, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:09:32.649538", + "step": 2648, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2907888889312744, + "timestamp": "2025-09-05 09:09:32.651742", + "step": 2649, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:09:32.808844", + "step": 2649, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.37796589732170105, + "timestamp": "2025-09-05 09:09:32.810776", + "step": 2650, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:32.968897", + "step": 2650, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29424649477005005, + "timestamp": "2025-09-05 09:09:32.970994", + "step": 2651, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:33.129396", + "step": 2651, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.36393117904663086, + "timestamp": "2025-09-05 09:09:33.145047", + "step": 2652, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:33.310349", + "step": 2652, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3564506769180298, + "timestamp": "2025-09-05 09:09:33.312321", + "step": 2653, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:09:33.478108", + "step": 2653, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20433595776557922, + "timestamp": "2025-09-05 09:09:33.479956", + "step": 2654, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:09:33.638178", + "step": 2654, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.38941457867622375, + "timestamp": "2025-09-05 09:09:33.641092", + "step": 2655, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:33.798976", + "step": 2655, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.276498019695282, + "timestamp": "2025-09-05 09:09:33.813114", + "step": 2656, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:33.964685", + "step": 2656, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22760221362113953, + "timestamp": "2025-09-05 09:09:33.966648", + "step": 2657, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:34.133765", + "step": 2657, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.39622488617897034, + "timestamp": "2025-09-05 09:09:34.135682", + "step": 2658, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:09:34.293759", + "step": 2658, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3628631830215454, + "timestamp": "2025-09-05 09:09:34.295822", + "step": 2659, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:34.453730", + "step": 2659, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28753095865249634, + "timestamp": "2025-09-05 09:09:34.467476", + "step": 2660, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:09:39.114526", + "step": 2660, + "epoch": 3 + }, + { + "type": "pplx", + "content": 54.31525715361568, + "timestamp": "2025-09-05 09:09:39.116914", + "step": 2660, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:39.253139", + "step": 2660, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.25859639048576355, + "timestamp": "2025-09-05 09:09:39.264102", + "step": 2661, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:39.404262", + "step": 2661, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23019303381443024, + "timestamp": "2025-09-05 09:09:39.407220", + "step": 2662, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:09:39.581353", + "step": 2662, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.5155594348907471, + "timestamp": "2025-09-05 09:09:39.583174", + "step": 2663, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:39.741092", + "step": 2663, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.37276023626327515, + "timestamp": "2025-09-05 09:09:39.755490", + "step": 2664, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:09:39.907403", + "step": 2664, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.41703230142593384, + "timestamp": "2025-09-05 09:09:39.909321", + "step": 2665, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:40.066875", + "step": 2665, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2926308512687683, + "timestamp": "2025-09-05 09:09:40.068643", + "step": 2666, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:40.226508", + "step": 2666, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4091567099094391, + "timestamp": "2025-09-05 09:09:40.228193", + "step": 2667, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:40.397210", + "step": 2667, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26959991455078125, + "timestamp": "2025-09-05 09:09:40.414775", + "step": 2668, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:40.572190", + "step": 2668, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2950553894042969, + "timestamp": "2025-09-05 09:09:40.587593", + "step": 2669, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:40.769007", + "step": 2669, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.35082414746284485, + "timestamp": "2025-09-05 09:09:40.771348", + "step": 2670, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:09:40.933475", + "step": 2670, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2740470767021179, + "timestamp": "2025-09-05 09:09:40.935596", + "step": 2671, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:41.106478", + "step": 2671, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3159100115299225, + "timestamp": "2025-09-05 09:09:41.115332", + "step": 2672, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:09:41.249508", + "step": 2672, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4648512303829193, + "timestamp": "2025-09-05 09:09:41.251877", + "step": 2673, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:09:41.409328", + "step": 2673, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3527366518974304, + "timestamp": "2025-09-05 09:09:41.411381", + "step": 2674, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:09:41.584798", + "step": 2674, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26771530508995056, + "timestamp": "2025-09-05 09:09:41.586705", + "step": 2675, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:41.745088", + "step": 2675, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34135565161705017, + "timestamp": "2025-09-05 09:09:41.761326", + "step": 2676, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:41.914539", + "step": 2676, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.35681870579719543, + "timestamp": "2025-09-05 09:09:41.920362", + "step": 2677, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:09:42.083593", + "step": 2677, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22552332282066345, + "timestamp": "2025-09-05 09:09:42.085921", + "step": 2678, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:42.255612", + "step": 2678, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24503037333488464, + "timestamp": "2025-09-05 09:09:42.257451", + "step": 2679, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:09:42.394045", + "step": 2679, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.228922039270401, + "timestamp": "2025-09-05 09:09:42.410435", + "step": 2680, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:09:47.123710", + "step": 2680, + "epoch": 3 + }, + { + "type": "pplx", + "content": 54.31896988097937, + "timestamp": "2025-09-05 09:09:47.126173", + "step": 2680, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2680", + "timestamp": "2025-09-05 09:09:47.641795", + "step": 2680, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:47.775868", + "step": 2680, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3755874037742615, + "timestamp": "2025-09-05 09:09:47.778167", + "step": 2681, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:47.915328", + "step": 2681, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30692851543426514, + "timestamp": "2025-09-05 09:09:47.917502", + "step": 2682, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:09:48.087264", + "step": 2682, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.41206371784210205, + "timestamp": "2025-09-05 09:09:48.089321", + "step": 2683, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:48.249641", + "step": 2683, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.38735464215278625, + "timestamp": "2025-09-05 09:09:48.263554", + "step": 2684, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:48.415430", + "step": 2684, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27017661929130554, + "timestamp": "2025-09-05 09:09:48.417668", + "step": 2685, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:48.553584", + "step": 2685, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3175680637359619, + "timestamp": "2025-09-05 09:09:48.555889", + "step": 2686, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:48.723796", + "step": 2686, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3680326044559479, + "timestamp": "2025-09-05 09:09:48.725538", + "step": 2687, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:09:48.885954", + "step": 2687, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29450035095214844, + "timestamp": "2025-09-05 09:09:48.900149", + "step": 2688, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:09:49.056183", + "step": 2688, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4046458303928375, + "timestamp": "2025-09-05 09:09:49.058643", + "step": 2689, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:49.216287", + "step": 2689, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3243006765842438, + "timestamp": "2025-09-05 09:09:49.218640", + "step": 2690, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:49.376779", + "step": 2690, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2823656499385834, + "timestamp": "2025-09-05 09:09:49.379245", + "step": 2691, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:49.557827", + "step": 2691, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3415341079235077, + "timestamp": "2025-09-05 09:09:49.574030", + "step": 2692, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:49.735573", + "step": 2692, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20649684965610504, + "timestamp": "2025-09-05 09:09:49.737926", + "step": 2693, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:49.897370", + "step": 2693, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.41073697805404663, + "timestamp": "2025-09-05 09:09:49.902347", + "step": 2694, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:09:50.080020", + "step": 2694, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4066583514213562, + "timestamp": "2025-09-05 09:09:50.083816", + "step": 2695, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:50.257701", + "step": 2695, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.43231305480003357, + "timestamp": "2025-09-05 09:09:50.271674", + "step": 2696, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:50.425120", + "step": 2696, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.43370968103408813, + "timestamp": "2025-09-05 09:09:50.427885", + "step": 2697, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:50.587228", + "step": 2697, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3017624616622925, + "timestamp": "2025-09-05 09:09:50.589376", + "step": 2698, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:50.747465", + "step": 2698, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.49601367115974426, + "timestamp": "2025-09-05 09:09:50.749698", + "step": 2699, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:50.919572", + "step": 2699, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3051532208919525, + "timestamp": "2025-09-05 09:09:50.934651", + "step": 2700, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:09:55.581191", + "step": 2700, + "epoch": 3 + }, + { + "type": "pplx", + "content": 54.71914685947849, + "timestamp": "2025-09-05 09:09:55.583324", + "step": 2700, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:55.715082", + "step": 2700, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24363669753074646, + "timestamp": "2025-09-05 09:09:55.717265", + "step": 2701, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:09:55.873487", + "step": 2701, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3039955198764801, + "timestamp": "2025-09-05 09:09:55.875289", + "step": 2702, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:56.042793", + "step": 2702, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3395662307739258, + "timestamp": "2025-09-05 09:09:56.044668", + "step": 2703, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:56.180369", + "step": 2703, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23626942932605743, + "timestamp": "2025-09-05 09:09:56.196460", + "step": 2704, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:56.356335", + "step": 2704, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3257940411567688, + "timestamp": "2025-09-05 09:09:56.358374", + "step": 2705, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:56.516397", + "step": 2705, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3324272930622101, + "timestamp": "2025-09-05 09:09:56.519005", + "step": 2706, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:56.658125", + "step": 2706, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.18577173352241516, + "timestamp": "2025-09-05 09:09:56.661453", + "step": 2707, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:56.843395", + "step": 2707, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3146699070930481, + "timestamp": "2025-09-05 09:09:56.860248", + "step": 2708, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:09:57.033006", + "step": 2708, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23967741429805756, + "timestamp": "2025-09-05 09:09:57.036268", + "step": 2709, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:57.212345", + "step": 2709, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23511698842048645, + "timestamp": "2025-09-05 09:09:57.214817", + "step": 2710, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:57.392545", + "step": 2710, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28559327125549316, + "timestamp": "2025-09-05 09:09:57.394953", + "step": 2711, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:57.569750", + "step": 2711, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.5006580352783203, + "timestamp": "2025-09-05 09:09:57.584025", + "step": 2712, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:57.742993", + "step": 2712, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2922325134277344, + "timestamp": "2025-09-05 09:09:57.751655", + "step": 2713, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:09:57.922784", + "step": 2713, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.17340846359729767, + "timestamp": "2025-09-05 09:09:57.930195", + "step": 2714, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:09:58.110260", + "step": 2714, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22841380536556244, + "timestamp": "2025-09-05 09:09:58.113975", + "step": 2715, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:58.276525", + "step": 2715, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3006599247455597, + "timestamp": "2025-09-05 09:09:58.293741", + "step": 2716, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:09:58.459743", + "step": 2716, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3553715646266937, + "timestamp": "2025-09-05 09:09:58.464739", + "step": 2717, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:58.636391", + "step": 2717, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31938436627388, + "timestamp": "2025-09-05 09:09:58.639450", + "step": 2718, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:09:58.819436", + "step": 2718, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22373679280281067, + "timestamp": "2025-09-05 09:09:58.822596", + "step": 2719, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:09:59.011310", + "step": 2719, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20499032735824585, + "timestamp": "2025-09-05 09:09:59.033254", + "step": 2720, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:10:03.869184", + "step": 2720, + "epoch": 3 + }, + { + "type": "pplx", + "content": 55.399467890917855, + "timestamp": "2025-09-05 09:10:03.871173", + "step": 2720, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2720", + "timestamp": "2025-09-05 09:10:04.314045", + "step": 2720, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:10:04.453237", + "step": 2720, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3693501055240631, + "timestamp": "2025-09-05 09:10:04.455637", + "step": 2721, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:04.623810", + "step": 2721, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2824878990650177, + "timestamp": "2025-09-05 09:10:04.627198", + "step": 2722, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:10:04.787875", + "step": 2722, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.307536244392395, + "timestamp": "2025-09-05 09:10:04.790194", + "step": 2723, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:10:04.950797", + "step": 2723, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2077270895242691, + "timestamp": "2025-09-05 09:10:04.959505", + "step": 2724, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:05.097999", + "step": 2724, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2842904031276703, + "timestamp": "2025-09-05 09:10:05.102783", + "step": 2725, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:05.262814", + "step": 2725, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30767548084259033, + "timestamp": "2025-09-05 09:10:05.266837", + "step": 2726, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:05.426245", + "step": 2726, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.41483354568481445, + "timestamp": "2025-09-05 09:10:05.429925", + "step": 2727, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:05.592194", + "step": 2727, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2677007019519806, + "timestamp": "2025-09-05 09:10:05.606152", + "step": 2728, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:10:05.761691", + "step": 2728, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3418111801147461, + "timestamp": "2025-09-05 09:10:05.763999", + "step": 2729, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:05.901368", + "step": 2729, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26328641176223755, + "timestamp": "2025-09-05 09:10:05.905885", + "step": 2730, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:06.077778", + "step": 2730, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.287534236907959, + "timestamp": "2025-09-05 09:10:06.079870", + "step": 2731, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:06.218060", + "step": 2731, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23290354013442993, + "timestamp": "2025-09-05 09:10:06.232682", + "step": 2732, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:10:06.387980", + "step": 2732, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.44484513998031616, + "timestamp": "2025-09-05 09:10:06.390645", + "step": 2733, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:06.560661", + "step": 2733, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3004152774810791, + "timestamp": "2025-09-05 09:10:06.565345", + "step": 2734, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:06.728713", + "step": 2734, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23439401388168335, + "timestamp": "2025-09-05 09:10:06.731721", + "step": 2735, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:06.906635", + "step": 2735, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.39753445982933044, + "timestamp": "2025-09-05 09:10:06.921311", + "step": 2736, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:07.081558", + "step": 2736, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26311197876930237, + "timestamp": "2025-09-05 09:10:07.084292", + "step": 2737, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:07.246437", + "step": 2737, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2834315896034241, + "timestamp": "2025-09-05 09:10:07.249366", + "step": 2738, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:10:07.421327", + "step": 2738, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.35053861141204834, + "timestamp": "2025-09-05 09:10:07.424016", + "step": 2739, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:07.601132", + "step": 2739, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2513752579689026, + "timestamp": "2025-09-05 09:10:07.618283", + "step": 2740, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:10:12.409186", + "step": 2740, + "epoch": 3 + }, + { + "type": "pplx", + "content": 55.606292247536096, + "timestamp": "2025-09-05 09:10:12.416713", + "step": 2740, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:12.551578", + "step": 2740, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2897755801677704, + "timestamp": "2025-09-05 09:10:12.558344", + "step": 2741, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:12.698223", + "step": 2741, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3819401264190674, + "timestamp": "2025-09-05 09:10:12.701247", + "step": 2742, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:12.841144", + "step": 2742, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3809162378311157, + "timestamp": "2025-09-05 09:10:12.847553", + "step": 2743, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:10:12.986801", + "step": 2743, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27368873357772827, + "timestamp": "2025-09-05 09:10:12.996377", + "step": 2744, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:13.135351", + "step": 2744, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.33143290877342224, + "timestamp": "2025-09-05 09:10:13.144607", + "step": 2745, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:13.288483", + "step": 2745, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23653104901313782, + "timestamp": "2025-09-05 09:10:13.293088", + "step": 2746, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:13.468711", + "step": 2746, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.37218546867370605, + "timestamp": "2025-09-05 09:10:13.471453", + "step": 2747, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:10:13.645855", + "step": 2747, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27283674478530884, + "timestamp": "2025-09-05 09:10:13.662936", + "step": 2748, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:10:13.822337", + "step": 2748, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4320858120918274, + "timestamp": "2025-09-05 09:10:13.825185", + "step": 2749, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:14.000505", + "step": 2749, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2452457696199417, + "timestamp": "2025-09-05 09:10:14.014447", + "step": 2750, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:10:14.182660", + "step": 2750, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1904885619878769, + "timestamp": "2025-09-05 09:10:14.185589", + "step": 2751, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:10:14.350431", + "step": 2751, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3453499376773834, + "timestamp": "2025-09-05 09:10:14.365541", + "step": 2752, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:14.524726", + "step": 2752, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.267494261264801, + "timestamp": "2025-09-05 09:10:14.528420", + "step": 2753, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:10:14.716589", + "step": 2753, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.43331682682037354, + "timestamp": "2025-09-05 09:10:14.720911", + "step": 2754, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:14.901813", + "step": 2754, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3932355046272278, + "timestamp": "2025-09-05 09:10:14.905171", + "step": 2755, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:15.085460", + "step": 2755, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3386424779891968, + "timestamp": "2025-09-05 09:10:15.102279", + "step": 2756, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:15.279114", + "step": 2756, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3107830584049225, + "timestamp": "2025-09-05 09:10:15.282876", + "step": 2757, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:15.461908", + "step": 2757, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2651340961456299, + "timestamp": "2025-09-05 09:10:15.465681", + "step": 2758, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:10:15.630422", + "step": 2758, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4322452247142792, + "timestamp": "2025-09-05 09:10:15.639175", + "step": 2759, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:15.802596", + "step": 2759, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3348475694656372, + "timestamp": "2025-09-05 09:10:15.819849", + "step": 2760, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:10:20.767860", + "step": 2760, + "epoch": 3 + }, + { + "type": "pplx", + "content": 55.56754759928199, + "timestamp": "2025-09-05 09:10:20.772035", + "step": 2760, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2760", + "timestamp": "2025-09-05 09:10:21.365515", + "step": 2760, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:21.595230", + "step": 2760, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23760850727558136, + "timestamp": "2025-09-05 09:10:21.602522", + "step": 2761, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:10:21.765693", + "step": 2761, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3854387402534485, + "timestamp": "2025-09-05 09:10:21.776647", + "step": 2762, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:10:21.940738", + "step": 2762, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3008959889411926, + "timestamp": "2025-09-05 09:10:21.944579", + "step": 2763, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:22.108064", + "step": 2763, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2746279537677765, + "timestamp": "2025-09-05 09:10:22.124956", + "step": 2764, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:22.294008", + "step": 2764, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28841450810432434, + "timestamp": "2025-09-05 09:10:22.296980", + "step": 2765, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:22.472863", + "step": 2765, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20862191915512085, + "timestamp": "2025-09-05 09:10:22.475833", + "step": 2766, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:22.639204", + "step": 2766, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2936626076698303, + "timestamp": "2025-09-05 09:10:22.648560", + "step": 2767, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:22.821191", + "step": 2767, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3055102825164795, + "timestamp": "2025-09-05 09:10:22.846361", + "step": 2768, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:23.025985", + "step": 2768, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.42737877368927, + "timestamp": "2025-09-05 09:10:23.037055", + "step": 2769, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:10:23.210901", + "step": 2769, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28168973326683044, + "timestamp": "2025-09-05 09:10:23.214976", + "step": 2770, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:23.404027", + "step": 2770, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4235506057739258, + "timestamp": "2025-09-05 09:10:23.407613", + "step": 2771, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:10:23.588220", + "step": 2771, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28592631220817566, + "timestamp": "2025-09-05 09:10:23.605025", + "step": 2772, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:10:23.773596", + "step": 2772, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2634555995464325, + "timestamp": "2025-09-05 09:10:23.776889", + "step": 2773, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:23.945473", + "step": 2773, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21984124183654785, + "timestamp": "2025-09-05 09:10:23.947745", + "step": 2774, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:24.086416", + "step": 2774, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20653694868087769, + "timestamp": "2025-09-05 09:10:24.090554", + "step": 2775, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:24.259486", + "step": 2775, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2279079109430313, + "timestamp": "2025-09-05 09:10:24.275952", + "step": 2776, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:10:24.442371", + "step": 2776, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.38293153047561646, + "timestamp": "2025-09-05 09:10:24.447664", + "step": 2777, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:24.631220", + "step": 2777, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2632295787334442, + "timestamp": "2025-09-05 09:10:24.634597", + "step": 2778, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:24.809574", + "step": 2778, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3497149348258972, + "timestamp": "2025-09-05 09:10:24.812800", + "step": 2779, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:24.977713", + "step": 2779, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3435024619102478, + "timestamp": "2025-09-05 09:10:24.995774", + "step": 2780, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:10:29.786411", + "step": 2780, + "epoch": 3 + }, + { + "type": "pplx", + "content": 55.502120442422544, + "timestamp": "2025-09-05 09:10:29.790346", + "step": 2780, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:10:29.922857", + "step": 2780, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2520749866962433, + "timestamp": "2025-09-05 09:10:29.932096", + "step": 2781, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:30.071586", + "step": 2781, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2531195282936096, + "timestamp": "2025-09-05 09:10:30.074133", + "step": 2782, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:10:30.213336", + "step": 2782, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.41319963335990906, + "timestamp": "2025-09-05 09:10:30.218550", + "step": 2783, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:30.360495", + "step": 2783, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.25934308767318726, + "timestamp": "2025-09-05 09:10:30.376438", + "step": 2784, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:30.514182", + "step": 2784, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.16343851387500763, + "timestamp": "2025-09-05 09:10:30.517315", + "step": 2785, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:30.657243", + "step": 2785, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34972044825553894, + "timestamp": "2025-09-05 09:10:30.660463", + "step": 2786, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:30.800154", + "step": 2786, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3237220346927643, + "timestamp": "2025-09-05 09:10:30.806393", + "step": 2787, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:30.948423", + "step": 2787, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2645550072193146, + "timestamp": "2025-09-05 09:10:30.959681", + "step": 2788, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:31.095759", + "step": 2788, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3123627305030823, + "timestamp": "2025-09-05 09:10:31.099566", + "step": 2789, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:31.249554", + "step": 2789, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2884657680988312, + "timestamp": "2025-09-05 09:10:31.252646", + "step": 2790, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:31.431942", + "step": 2790, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22808344662189484, + "timestamp": "2025-09-05 09:10:31.434481", + "step": 2791, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:31.608196", + "step": 2791, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4153204560279846, + "timestamp": "2025-09-05 09:10:31.627058", + "step": 2792, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:31.788114", + "step": 2792, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24033991992473602, + "timestamp": "2025-09-05 09:10:31.790509", + "step": 2793, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:31.966073", + "step": 2793, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2547632157802582, + "timestamp": "2025-09-05 09:10:31.970354", + "step": 2794, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:32.145056", + "step": 2794, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3763968050479889, + "timestamp": "2025-09-05 09:10:32.147688", + "step": 2795, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:32.310468", + "step": 2795, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4241531491279602, + "timestamp": "2025-09-05 09:10:32.324920", + "step": 2796, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:10:32.478476", + "step": 2796, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2867893874645233, + "timestamp": "2025-09-05 09:10:32.481606", + "step": 2797, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:32.645290", + "step": 2797, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2623746991157532, + "timestamp": "2025-09-05 09:10:32.650158", + "step": 2798, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:32.829402", + "step": 2798, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.353633314371109, + "timestamp": "2025-09-05 09:10:32.832494", + "step": 2799, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:33.007499", + "step": 2799, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3284893035888672, + "timestamp": "2025-09-05 09:10:33.023668", + "step": 2800, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:10:37.838708", + "step": 2800, + "epoch": 3 + }, + { + "type": "pplx", + "content": 56.313607306551525, + "timestamp": "2025-09-05 09:10:37.840865", + "step": 2800, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2800", + "timestamp": "2025-09-05 09:10:38.386248", + "step": 2800, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:38.566979", + "step": 2800, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2466062307357788, + "timestamp": "2025-09-05 09:10:38.574310", + "step": 2801, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:38.797138", + "step": 2801, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.297829270362854, + "timestamp": "2025-09-05 09:10:38.801752", + "step": 2802, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:39.027938", + "step": 2802, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3962607681751251, + "timestamp": "2025-09-05 09:10:39.037773", + "step": 2803, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:39.256098", + "step": 2803, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23827433586120605, + "timestamp": "2025-09-05 09:10:39.273813", + "step": 2804, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:10:39.486250", + "step": 2804, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34505435824394226, + "timestamp": "2025-09-05 09:10:39.498012", + "step": 2805, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:39.701452", + "step": 2805, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.17793166637420654, + "timestamp": "2025-09-05 09:10:39.706284", + "step": 2806, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:39.936500", + "step": 2806, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.387498140335083, + "timestamp": "2025-09-05 09:10:39.943246", + "step": 2807, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:40.141959", + "step": 2807, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.25481945276260376, + "timestamp": "2025-09-05 09:10:40.161190", + "step": 2808, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:40.369536", + "step": 2808, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22373159229755402, + "timestamp": "2025-09-05 09:10:40.373589", + "step": 2809, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:40.584720", + "step": 2809, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2652718126773834, + "timestamp": "2025-09-05 09:10:40.592769", + "step": 2810, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:40.811016", + "step": 2810, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24120259284973145, + "timestamp": "2025-09-05 09:10:40.821878", + "step": 2811, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:10:40.994024", + "step": 2811, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29725131392478943, + "timestamp": "2025-09-05 09:10:41.010223", + "step": 2812, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:41.187657", + "step": 2812, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24363994598388672, + "timestamp": "2025-09-05 09:10:41.193535", + "step": 2813, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:10:41.369512", + "step": 2813, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29551854729652405, + "timestamp": "2025-09-05 09:10:41.374421", + "step": 2814, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:10:41.645614", + "step": 2814, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3614238202571869, + "timestamp": "2025-09-05 09:10:41.649105", + "step": 2815, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:41.827736", + "step": 2815, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4348675310611725, + "timestamp": "2025-09-05 09:10:41.847192", + "step": 2816, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:10:42.024160", + "step": 2816, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26274287700653076, + "timestamp": "2025-09-05 09:10:42.026226", + "step": 2817, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:42.201657", + "step": 2817, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28489479422569275, + "timestamp": "2025-09-05 09:10:42.204164", + "step": 2818, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:42.382200", + "step": 2818, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3202391564846039, + "timestamp": "2025-09-05 09:10:42.385910", + "step": 2819, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:42.566485", + "step": 2819, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31756865978240967, + "timestamp": "2025-09-05 09:10:42.583875", + "step": 2820, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:10:48.047056", + "step": 2820, + "epoch": 3 + }, + { + "type": "pplx", + "content": 57.04747695124036, + "timestamp": "2025-09-05 09:10:48.051916", + "step": 2820, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:48.232338", + "step": 2820, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2091667205095291, + "timestamp": "2025-09-05 09:10:48.235792", + "step": 2821, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:48.446456", + "step": 2821, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.39467981457710266, + "timestamp": "2025-09-05 09:10:48.449068", + "step": 2822, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:48.647493", + "step": 2822, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2453453540802002, + "timestamp": "2025-09-05 09:10:48.651805", + "step": 2823, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:48.851775", + "step": 2823, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3213421404361725, + "timestamp": "2025-09-05 09:10:48.870243", + "step": 2824, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:49.067797", + "step": 2824, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.360073447227478, + "timestamp": "2025-09-05 09:10:49.071550", + "step": 2825, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:49.288504", + "step": 2825, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3350084125995636, + "timestamp": "2025-09-05 09:10:49.293974", + "step": 2826, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:49.517532", + "step": 2826, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2850082218647003, + "timestamp": "2025-09-05 09:10:49.521509", + "step": 2827, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:49.734675", + "step": 2827, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24017834663391113, + "timestamp": "2025-09-05 09:10:49.750677", + "step": 2828, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:10:49.956360", + "step": 2828, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.38482263684272766, + "timestamp": "2025-09-05 09:10:49.958550", + "step": 2829, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:50.165235", + "step": 2829, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.382106751203537, + "timestamp": "2025-09-05 09:10:50.169821", + "step": 2830, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:50.376138", + "step": 2830, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2114604413509369, + "timestamp": "2025-09-05 09:10:50.378830", + "step": 2831, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:50.589159", + "step": 2831, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2949742078781128, + "timestamp": "2025-09-05 09:10:50.605645", + "step": 2832, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:50.844757", + "step": 2832, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19070105254650116, + "timestamp": "2025-09-05 09:10:50.847226", + "step": 2833, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:51.046271", + "step": 2833, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23265816271305084, + "timestamp": "2025-09-05 09:10:51.049494", + "step": 2834, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:51.214113", + "step": 2834, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3064287602901459, + "timestamp": "2025-09-05 09:10:51.221071", + "step": 2835, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:51.431369", + "step": 2835, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.33445465564727783, + "timestamp": "2025-09-05 09:10:51.446234", + "step": 2836, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:10:51.644763", + "step": 2836, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2877272665500641, + "timestamp": "2025-09-05 09:10:51.646928", + "step": 2837, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:10:51.846453", + "step": 2837, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2460276037454605, + "timestamp": "2025-09-05 09:10:51.851098", + "step": 2838, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:52.053185", + "step": 2838, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.47292837500572205, + "timestamp": "2025-09-05 09:10:52.056120", + "step": 2839, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:52.253361", + "step": 2839, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23295848071575165, + "timestamp": "2025-09-05 09:10:52.272148", + "step": 2840, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:10:57.469162", + "step": 2840, + "epoch": 3 + }, + { + "type": "pplx", + "content": 56.21594373110914, + "timestamp": "2025-09-05 09:10:57.473155", + "step": 2840, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2840", + "timestamp": "2025-09-05 09:10:57.984109", + "step": 2840, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:58.122877", + "step": 2840, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31002530455589294, + "timestamp": "2025-09-05 09:10:58.127195", + "step": 2841, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:10:58.305422", + "step": 2841, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28681862354278564, + "timestamp": "2025-09-05 09:10:58.350144", + "step": 2842, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:10:58.535226", + "step": 2842, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.44048944115638733, + "timestamp": "2025-09-05 09:10:58.576570", + "step": 2843, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:58.866502", + "step": 2843, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4286167025566101, + "timestamp": "2025-09-05 09:10:58.881406", + "step": 2844, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:10:59.144763", + "step": 2844, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28463494777679443, + "timestamp": "2025-09-05 09:10:59.149799", + "step": 2845, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:10:59.319538", + "step": 2845, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2371792048215866, + "timestamp": "2025-09-05 09:10:59.321865", + "step": 2846, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:10:59.494662", + "step": 2846, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27211883664131165, + "timestamp": "2025-09-05 09:10:59.525047", + "step": 2847, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:10:59.758416", + "step": 2847, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2545848488807678, + "timestamp": "2025-09-05 09:10:59.776991", + "step": 2848, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:10:59.931962", + "step": 2848, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24429751932621002, + "timestamp": "2025-09-05 09:10:59.935000", + "step": 2849, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:00.096442", + "step": 2849, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20609146356582642, + "timestamp": "2025-09-05 09:11:00.099578", + "step": 2850, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:00.262502", + "step": 2850, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.274873286485672, + "timestamp": "2025-09-05 09:11:00.306226", + "step": 2851, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:11:00.478179", + "step": 2851, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24983830749988556, + "timestamp": "2025-09-05 09:11:00.501042", + "step": 2852, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:00.666751", + "step": 2852, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2654348611831665, + "timestamp": "2025-09-05 09:11:00.670801", + "step": 2853, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:11:00.854654", + "step": 2853, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2965334951877594, + "timestamp": "2025-09-05 09:11:00.857254", + "step": 2854, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:11:01.111321", + "step": 2854, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.5099512338638306, + "timestamp": "2025-09-05 09:11:01.114036", + "step": 2855, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:11:01.289857", + "step": 2855, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27999117970466614, + "timestamp": "2025-09-05 09:11:01.305114", + "step": 2856, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:11:01.460920", + "step": 2856, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4357217848300934, + "timestamp": "2025-09-05 09:11:01.463101", + "step": 2857, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:01.637596", + "step": 2857, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.304353266954422, + "timestamp": "2025-09-05 09:11:01.640459", + "step": 2858, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:01.867325", + "step": 2858, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3254320025444031, + "timestamp": "2025-09-05 09:11:01.870492", + "step": 2859, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:11:02.041130", + "step": 2859, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.42343053221702576, + "timestamp": "2025-09-05 09:11:02.056588", + "step": 2860, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:11:07.149407", + "step": 2860, + "epoch": 3 + }, + { + "type": "pplx", + "content": 56.447537887231604, + "timestamp": "2025-09-05 09:11:07.153770", + "step": 2860, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:07.324510", + "step": 2860, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2908163368701935, + "timestamp": "2025-09-05 09:11:07.329103", + "step": 2861, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:11:07.535867", + "step": 2861, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.272657185792923, + "timestamp": "2025-09-05 09:11:07.538006", + "step": 2862, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:07.788214", + "step": 2862, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30033841729164124, + "timestamp": "2025-09-05 09:11:07.790608", + "step": 2863, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:07.999472", + "step": 2863, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3589075207710266, + "timestamp": "2025-09-05 09:11:08.008266", + "step": 2864, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:08.174062", + "step": 2864, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2896050810813904, + "timestamp": "2025-09-05 09:11:08.177026", + "step": 2865, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:08.385757", + "step": 2865, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.303393691778183, + "timestamp": "2025-09-05 09:11:08.390120", + "step": 2866, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:08.567458", + "step": 2866, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2631215453147888, + "timestamp": "2025-09-05 09:11:08.570872", + "step": 2867, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:11:08.732555", + "step": 2867, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.17335376143455505, + "timestamp": "2025-09-05 09:11:08.749457", + "step": 2868, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:11:08.917535", + "step": 2868, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3204255998134613, + "timestamp": "2025-09-05 09:11:08.920552", + "step": 2869, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:09.083644", + "step": 2869, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2697032392024994, + "timestamp": "2025-09-05 09:11:09.117302", + "step": 2870, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:09.304974", + "step": 2870, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2992748022079468, + "timestamp": "2025-09-05 09:11:09.307880", + "step": 2871, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:11:09.489092", + "step": 2871, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4421190619468689, + "timestamp": "2025-09-05 09:11:09.503449", + "step": 2872, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:11:09.655522", + "step": 2872, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3196788728237152, + "timestamp": "2025-09-05 09:11:09.657440", + "step": 2873, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:09.845994", + "step": 2873, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.35318523645401, + "timestamp": "2025-09-05 09:11:09.848550", + "step": 2874, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:10.074877", + "step": 2874, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2514253556728363, + "timestamp": "2025-09-05 09:11:10.078009", + "step": 2875, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:10.251551", + "step": 2875, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26804113388061523, + "timestamp": "2025-09-05 09:11:10.265438", + "step": 2876, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:10.417155", + "step": 2876, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4025658071041107, + "timestamp": "2025-09-05 09:11:10.419938", + "step": 2877, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:10.590848", + "step": 2877, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30375415086746216, + "timestamp": "2025-09-05 09:11:10.593530", + "step": 2878, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:10.848347", + "step": 2878, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20703478157520294, + "timestamp": "2025-09-05 09:11:10.850887", + "step": 2879, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:11.077416", + "step": 2879, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2653372287750244, + "timestamp": "2025-09-05 09:11:11.094236", + "step": 2880, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:11:16.273351", + "step": 2880, + "epoch": 3 + }, + { + "type": "pplx", + "content": 56.14329639107681, + "timestamp": "2025-09-05 09:11:16.275174", + "step": 2880, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2880", + "timestamp": "2025-09-05 09:11:16.727654", + "step": 2880, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:16.902534", + "step": 2880, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.18782301247119904, + "timestamp": "2025-09-05 09:11:16.904887", + "step": 2881, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:17.108973", + "step": 2881, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.46404707431793213, + "timestamp": "2025-09-05 09:11:17.111598", + "step": 2882, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:17.330037", + "step": 2882, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23860648274421692, + "timestamp": "2025-09-05 09:11:17.332733", + "step": 2883, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:17.579843", + "step": 2883, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3546093702316284, + "timestamp": "2025-09-05 09:11:17.594163", + "step": 2884, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:17.783185", + "step": 2884, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2759442627429962, + "timestamp": "2025-09-05 09:11:17.785356", + "step": 2885, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:11:17.981116", + "step": 2885, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28804221749305725, + "timestamp": "2025-09-05 09:11:17.983364", + "step": 2886, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:18.178205", + "step": 2886, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2742845416069031, + "timestamp": "2025-09-05 09:11:18.180403", + "step": 2887, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:11:18.385459", + "step": 2887, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2764730155467987, + "timestamp": "2025-09-05 09:11:18.394721", + "step": 2888, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:11:18.557787", + "step": 2888, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29613184928894043, + "timestamp": "2025-09-05 09:11:18.559668", + "step": 2889, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:18.763335", + "step": 2889, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3713921010494232, + "timestamp": "2025-09-05 09:11:18.765453", + "step": 2890, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:11:18.969399", + "step": 2890, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3053815960884094, + "timestamp": "2025-09-05 09:11:18.971277", + "step": 2891, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:19.176772", + "step": 2891, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2894931137561798, + "timestamp": "2025-09-05 09:11:19.190110", + "step": 2892, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:11:19.381008", + "step": 2892, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.449165016412735, + "timestamp": "2025-09-05 09:11:19.383226", + "step": 2893, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:19.675218", + "step": 2893, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28056976199150085, + "timestamp": "2025-09-05 09:11:19.677221", + "step": 2894, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:11:19.884855", + "step": 2894, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.40280672907829285, + "timestamp": "2025-09-05 09:11:19.887529", + "step": 2895, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:20.083466", + "step": 2895, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3817463219165802, + "timestamp": "2025-09-05 09:11:20.097033", + "step": 2896, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:11:20.289971", + "step": 2896, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.38125133514404297, + "timestamp": "2025-09-05 09:11:20.292065", + "step": 2897, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:20.487917", + "step": 2897, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29178139567375183, + "timestamp": "2025-09-05 09:11:20.489961", + "step": 2898, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:11:20.694429", + "step": 2898, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.41615185141563416, + "timestamp": "2025-09-05 09:11:20.696952", + "step": 2899, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:20.894823", + "step": 2899, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2605353593826294, + "timestamp": "2025-09-05 09:11:20.950986", + "step": 2900, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:11:27.057789", + "step": 2900, + "epoch": 3 + }, + { + "type": "pplx", + "content": 55.594667031207955, + "timestamp": "2025-09-05 09:11:27.061928", + "step": 2900, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:11:27.195690", + "step": 2900, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3752339482307434, + "timestamp": "2025-09-05 09:11:27.198217", + "step": 2901, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:11:27.340205", + "step": 2901, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2262498438358307, + "timestamp": "2025-09-05 09:11:27.342415", + "step": 2902, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:27.478265", + "step": 2902, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2441215217113495, + "timestamp": "2025-09-05 09:11:27.480989", + "step": 2903, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:11:27.661026", + "step": 2903, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.32166409492492676, + "timestamp": "2025-09-05 09:11:27.670179", + "step": 2904, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:11:27.804903", + "step": 2904, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3819389045238495, + "timestamp": "2025-09-05 09:11:27.806864", + "step": 2905, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:11:27.941693", + "step": 2905, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3978331387042999, + "timestamp": "2025-09-05 09:11:27.944445", + "step": 2906, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:28.082659", + "step": 2906, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.16381220519542694, + "timestamp": "2025-09-05 09:11:28.085360", + "step": 2907, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:28.311273", + "step": 2907, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29129958152770996, + "timestamp": "2025-09-05 09:11:28.327758", + "step": 2908, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:11:28.495832", + "step": 2908, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24345554411411285, + "timestamp": "2025-09-05 09:11:28.498309", + "step": 2909, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:28.659067", + "step": 2909, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3856002986431122, + "timestamp": "2025-09-05 09:11:28.661532", + "step": 2910, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:29.055825", + "step": 2910, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.32962942123413086, + "timestamp": "2025-09-05 09:11:29.058096", + "step": 2911, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:11:29.295803", + "step": 2911, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4046390652656555, + "timestamp": "2025-09-05 09:11:29.311015", + "step": 2912, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:11:29.465817", + "step": 2912, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26469311118125916, + "timestamp": "2025-09-05 09:11:29.467655", + "step": 2913, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:11:29.635648", + "step": 2913, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2808852791786194, + "timestamp": "2025-09-05 09:11:29.637811", + "step": 2914, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:11:29.810312", + "step": 2914, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3875598907470703, + "timestamp": "2025-09-05 09:11:29.812426", + "step": 2915, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:29.974711", + "step": 2915, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23085609078407288, + "timestamp": "2025-09-05 09:11:29.991556", + "step": 2916, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:11:30.257525", + "step": 2916, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24958936870098114, + "timestamp": "2025-09-05 09:11:30.300724", + "step": 2917, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:30.744519", + "step": 2917, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1771358847618103, + "timestamp": "2025-09-05 09:11:30.774755", + "step": 2918, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:31.008046", + "step": 2918, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22553907334804535, + "timestamp": "2025-09-05 09:11:31.010166", + "step": 2919, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:31.185841", + "step": 2919, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24437619745731354, + "timestamp": "2025-09-05 09:11:31.203679", + "step": 2920, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:11:36.728102", + "step": 2920, + "epoch": 3 + }, + { + "type": "pplx", + "content": 54.64921801749989, + "timestamp": "2025-09-05 09:11:36.770957", + "step": 2920, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2920", + "timestamp": "2025-09-05 09:11:37.232785", + "step": 2920, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:11:37.417932", + "step": 2920, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21169906854629517, + "timestamp": "2025-09-05 09:11:37.420206", + "step": 2921, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:11:37.614738", + "step": 2921, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2853226363658905, + "timestamp": "2025-09-05 09:11:37.621981", + "step": 2922, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:37.875781", + "step": 2922, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.43432602286338806, + "timestamp": "2025-09-05 09:11:37.878382", + "step": 2923, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:38.169912", + "step": 2923, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.340687096118927, + "timestamp": "2025-09-05 09:11:38.184822", + "step": 2924, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:38.404087", + "step": 2924, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23280911147594452, + "timestamp": "2025-09-05 09:11:38.423992", + "step": 2925, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:38.717427", + "step": 2925, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3444267511367798, + "timestamp": "2025-09-05 09:11:38.725155", + "step": 2926, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 240 + ], + "flops": 4800029206464.0 + }, + "timestamp": "2025-09-05 09:11:39.030007", + "step": 2926, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.49055567383766174, + "timestamp": "2025-09-05 09:11:39.032125", + "step": 2927, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:39.236375", + "step": 2927, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.15985403954982758, + "timestamp": "2025-09-05 09:11:39.249699", + "step": 2928, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:11:39.494175", + "step": 2928, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3456610441207886, + "timestamp": "2025-09-05 09:11:39.537446", + "step": 2929, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:39.746492", + "step": 2929, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3735094368457794, + "timestamp": "2025-09-05 09:11:39.748659", + "step": 2930, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:39.951823", + "step": 2930, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24350151419639587, + "timestamp": "2025-09-05 09:11:39.954047", + "step": 2931, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:40.120027", + "step": 2931, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.16938263177871704, + "timestamp": "2025-09-05 09:11:40.142112", + "step": 2932, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:11:40.415486", + "step": 2932, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2621157169342041, + "timestamp": "2025-09-05 09:11:40.417763", + "step": 2933, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:11:40.629949", + "step": 2933, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29170507192611694, + "timestamp": "2025-09-05 09:11:40.632243", + "step": 2934, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:40.840341", + "step": 2934, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20361952483654022, + "timestamp": "2025-09-05 09:11:40.842802", + "step": 2935, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:41.091216", + "step": 2935, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2855290174484253, + "timestamp": "2025-09-05 09:11:41.105181", + "step": 2936, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:41.293735", + "step": 2936, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2542724609375, + "timestamp": "2025-09-05 09:11:41.295789", + "step": 2937, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:11:41.488491", + "step": 2937, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21698574721813202, + "timestamp": "2025-09-05 09:11:41.490288", + "step": 2938, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:41.684731", + "step": 2938, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24226777255535126, + "timestamp": "2025-09-05 09:11:41.687400", + "step": 2939, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:11:41.979428", + "step": 2939, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24583564698696136, + "timestamp": "2025-09-05 09:11:41.992509", + "step": 2940, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:11:47.784860", + "step": 2940, + "epoch": 3 + }, + { + "type": "pplx", + "content": 54.78391043335974, + "timestamp": "2025-09-05 09:11:47.787728", + "step": 2940, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:48.031744", + "step": 2940, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2940543293952942, + "timestamp": "2025-09-05 09:11:48.050706", + "step": 2941, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:11:48.259904", + "step": 2941, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.39327362179756165, + "timestamp": "2025-09-05 09:11:48.262429", + "step": 2942, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:48.460178", + "step": 2942, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.38169828057289124, + "timestamp": "2025-09-05 09:11:48.462726", + "step": 2943, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:48.669720", + "step": 2943, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3624424338340759, + "timestamp": "2025-09-05 09:11:48.724431", + "step": 2944, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:11:48.968080", + "step": 2944, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1788206547498703, + "timestamp": "2025-09-05 09:11:49.012455", + "step": 2945, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:11:49.352156", + "step": 2945, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.32311442494392395, + "timestamp": "2025-09-05 09:11:49.354956", + "step": 2946, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:11:49.559728", + "step": 2946, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.15347428619861603, + "timestamp": "2025-09-05 09:11:49.562279", + "step": 2947, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:49.767876", + "step": 2947, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3563332259654999, + "timestamp": "2025-09-05 09:11:49.825040", + "step": 2948, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:50.052862", + "step": 2948, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.13164092600345612, + "timestamp": "2025-09-05 09:11:50.055820", + "step": 2949, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:11:50.263077", + "step": 2949, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.41619938611984253, + "timestamp": "2025-09-05 09:11:50.265649", + "step": 2950, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:50.462108", + "step": 2950, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31873831152915955, + "timestamp": "2025-09-05 09:11:50.500202", + "step": 2951, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:11:50.748488", + "step": 2951, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.38107752799987793, + "timestamp": "2025-09-05 09:11:50.762883", + "step": 2952, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:50.951238", + "step": 2952, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2561323344707489, + "timestamp": "2025-09-05 09:11:50.953267", + "step": 2953, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:51.157835", + "step": 2953, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.42956051230430603, + "timestamp": "2025-09-05 09:11:51.200100", + "step": 2954, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:11:51.449318", + "step": 2954, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.328776091337204, + "timestamp": "2025-09-05 09:11:51.451728", + "step": 2955, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:11:51.656524", + "step": 2955, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3320053815841675, + "timestamp": "2025-09-05 09:11:51.670659", + "step": 2956, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:11:51.863783", + "step": 2956, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.42864733934402466, + "timestamp": "2025-09-05 09:11:51.866444", + "step": 2957, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:52.114254", + "step": 2957, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.35642778873443604, + "timestamp": "2025-09-05 09:11:52.158212", + "step": 2958, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:11:52.458736", + "step": 2958, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.46565619111061096, + "timestamp": "2025-09-05 09:11:52.461733", + "step": 2959, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:52.667328", + "step": 2959, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.265375018119812, + "timestamp": "2025-09-05 09:11:52.682797", + "step": 2960, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:11:58.548505", + "step": 2960, + "epoch": 3 + }, + { + "type": "pplx", + "content": 55.55334624048589, + "timestamp": "2025-09-05 09:11:58.550509", + "step": 2960, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2960", + "timestamp": "2025-09-05 09:11:58.989021", + "step": 2960, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:11:59.158088", + "step": 2960, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.38203999400138855, + "timestamp": "2025-09-05 09:11:59.160162", + "step": 2961, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:59.356216", + "step": 2961, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2547346353530884, + "timestamp": "2025-09-05 09:11:59.357933", + "step": 2962, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:11:59.552815", + "step": 2962, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27306175231933594, + "timestamp": "2025-09-05 09:11:59.554906", + "step": 2963, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:11:59.751367", + "step": 2963, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3331190347671509, + "timestamp": "2025-09-05 09:11:59.766971", + "step": 2964, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:11:59.963517", + "step": 2964, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.47358471155166626, + "timestamp": "2025-09-05 09:11:59.965670", + "step": 2965, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:12:00.174056", + "step": 2965, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2792886197566986, + "timestamp": "2025-09-05 09:12:00.175714", + "step": 2966, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:12:00.370898", + "step": 2966, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.32101067900657654, + "timestamp": "2025-09-05 09:12:00.372845", + "step": 2967, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:00.538749", + "step": 2967, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27621304988861084, + "timestamp": "2025-09-05 09:12:00.552344", + "step": 2968, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:12:00.741327", + "step": 2968, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.415333092212677, + "timestamp": "2025-09-05 09:12:00.743833", + "step": 2969, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:12:00.950697", + "step": 2969, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26739320158958435, + "timestamp": "2025-09-05 09:12:00.952648", + "step": 2970, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:12:01.159894", + "step": 2970, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24618351459503174, + "timestamp": "2025-09-05 09:12:01.162045", + "step": 2971, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:01.324984", + "step": 2971, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20293676853179932, + "timestamp": "2025-09-05 09:12:01.342168", + "step": 2972, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:01.538887", + "step": 2972, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.36944884061813354, + "timestamp": "2025-09-05 09:12:01.540964", + "step": 2973, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:12:01.738840", + "step": 2973, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28939300775527954, + "timestamp": "2025-09-05 09:12:01.743461", + "step": 2974, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:01.953118", + "step": 2974, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.37455692887306213, + "timestamp": "2025-09-05 09:12:01.955446", + "step": 2975, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:02.165822", + "step": 2975, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3490968644618988, + "timestamp": "2025-09-05 09:12:02.179427", + "step": 2976, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:02.375029", + "step": 2976, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.375367134809494, + "timestamp": "2025-09-05 09:12:02.378208", + "step": 2977, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:02.574703", + "step": 2977, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2835385799407959, + "timestamp": "2025-09-05 09:12:02.576904", + "step": 2978, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:02.771920", + "step": 2978, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.39227986335754395, + "timestamp": "2025-09-05 09:12:02.774121", + "step": 2979, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:02.968704", + "step": 2979, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29060855507850647, + "timestamp": "2025-09-05 09:12:02.985594", + "step": 2980, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:12:07.636923", + "step": 2980, + "epoch": 3 + }, + { + "type": "pplx", + "content": 54.606959405023254, + "timestamp": "2025-09-05 09:12:07.638898", + "step": 2980, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:07.798582", + "step": 2980, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20793351531028748, + "timestamp": "2025-09-05 09:12:07.800534", + "step": 2981, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:07.965219", + "step": 2981, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2619383931159973, + "timestamp": "2025-09-05 09:12:07.967179", + "step": 2982, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:12:08.171327", + "step": 2982, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27213922142982483, + "timestamp": "2025-09-05 09:12:08.173305", + "step": 2983, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:08.376566", + "step": 2983, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2601911425590515, + "timestamp": "2025-09-05 09:12:08.385366", + "step": 2984, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:08.547557", + "step": 2984, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2833835184574127, + "timestamp": "2025-09-05 09:12:08.549483", + "step": 2985, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:08.754222", + "step": 2985, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.40964579582214355, + "timestamp": "2025-09-05 09:12:08.756656", + "step": 2986, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:12:08.961846", + "step": 2986, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.41075843572616577, + "timestamp": "2025-09-05 09:12:08.964043", + "step": 2987, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:09.168674", + "step": 2987, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.43984243273735046, + "timestamp": "2025-09-05 09:12:09.182401", + "step": 2988, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:09.368376", + "step": 2988, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.40275534987449646, + "timestamp": "2025-09-05 09:12:09.370394", + "step": 2989, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:12:09.574501", + "step": 2989, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.32596346735954285, + "timestamp": "2025-09-05 09:12:09.576459", + "step": 2990, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:09.741903", + "step": 2990, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3702400326728821, + "timestamp": "2025-09-05 09:12:09.744360", + "step": 2991, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:09.934911", + "step": 2991, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.33719366788864136, + "timestamp": "2025-09-05 09:12:09.943923", + "step": 2992, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:10.102127", + "step": 2992, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2717023491859436, + "timestamp": "2025-09-05 09:12:10.104021", + "step": 2993, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:12:10.308830", + "step": 2993, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.40143582224845886, + "timestamp": "2025-09-05 09:12:10.311152", + "step": 2994, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:12:10.503737", + "step": 2994, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2984316349029541, + "timestamp": "2025-09-05 09:12:10.510745", + "step": 2995, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:10.706382", + "step": 2995, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.14753331243991852, + "timestamp": "2025-09-05 09:12:10.715441", + "step": 2996, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:10.876908", + "step": 2996, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2177143543958664, + "timestamp": "2025-09-05 09:12:10.879118", + "step": 2997, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:11.082860", + "step": 2997, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.39515912532806396, + "timestamp": "2025-09-05 09:12:11.085024", + "step": 2998, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:12:11.252542", + "step": 2998, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3414614498615265, + "timestamp": "2025-09-05 09:12:11.254655", + "step": 2999, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:11.448211", + "step": 2999, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34329718351364136, + "timestamp": "2025-09-05 09:12:11.462044", + "step": 3000, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:12:16.107356", + "step": 3000, + "epoch": 3 + }, + { + "type": "pplx", + "content": 53.10410466038564, + "timestamp": "2025-09-05 09:12:16.109410", + "step": 3000, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3000", + "timestamp": "2025-09-05 09:12:16.567964", + "step": 3000, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:12:16.746854", + "step": 3000, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.5219593048095703, + "timestamp": "2025-09-05 09:12:16.748733", + "step": 3001, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:12:16.914361", + "step": 3001, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22384533286094666, + "timestamp": "2025-09-05 09:12:16.916455", + "step": 3002, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:17.118982", + "step": 3002, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.35147175192832947, + "timestamp": "2025-09-05 09:12:17.121318", + "step": 3003, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:12:17.324431", + "step": 3003, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3589687943458557, + "timestamp": "2025-09-05 09:12:17.340452", + "step": 3004, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:12:17.535790", + "step": 3004, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27491340041160583, + "timestamp": "2025-09-05 09:12:17.538762", + "step": 3005, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:12:17.732860", + "step": 3005, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2931725084781647, + "timestamp": "2025-09-05 09:12:17.735179", + "step": 3006, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:17.937747", + "step": 3006, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.299150675535202, + "timestamp": "2025-09-05 09:12:17.940089", + "step": 3007, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:18.105046", + "step": 3007, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30979087948799133, + "timestamp": "2025-09-05 09:12:18.120936", + "step": 3008, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:18.317017", + "step": 3008, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26512518525123596, + "timestamp": "2025-09-05 09:12:18.319197", + "step": 3009, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:18.513289", + "step": 3009, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1896071434020996, + "timestamp": "2025-09-05 09:12:18.515493", + "step": 3010, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:18.711265", + "step": 3010, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3693452477455139, + "timestamp": "2025-09-05 09:12:18.713314", + "step": 3011, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:18.915519", + "step": 3011, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2476043701171875, + "timestamp": "2025-09-05 09:12:18.929455", + "step": 3012, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:12:19.117546", + "step": 3012, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3103981912136078, + "timestamp": "2025-09-05 09:12:19.119840", + "step": 3013, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:12:19.324252", + "step": 3013, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.38308852910995483, + "timestamp": "2025-09-05 09:12:19.326376", + "step": 3014, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:12:19.530283", + "step": 3014, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27662599086761475, + "timestamp": "2025-09-05 09:12:19.532388", + "step": 3015, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:19.725363", + "step": 3015, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2989875376224518, + "timestamp": "2025-09-05 09:12:19.739697", + "step": 3016, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:12:19.924782", + "step": 3016, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2708384394645691, + "timestamp": "2025-09-05 09:12:19.927781", + "step": 3017, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:20.122071", + "step": 3017, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2433338612318039, + "timestamp": "2025-09-05 09:12:20.124302", + "step": 3018, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:20.288372", + "step": 3018, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.304372102022171, + "timestamp": "2025-09-05 09:12:20.291067", + "step": 3019, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:20.495596", + "step": 3019, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27465543150901794, + "timestamp": "2025-09-05 09:12:20.509765", + "step": 3020, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:12:25.167359", + "step": 3020, + "epoch": 3 + }, + { + "type": "pplx", + "content": 52.50252237851087, + "timestamp": "2025-09-05 09:12:25.169469", + "step": 3020, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:12:25.330830", + "step": 3020, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2500014305114746, + "timestamp": "2025-09-05 09:12:25.332814", + "step": 3021, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:12:25.496063", + "step": 3021, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2104569673538208, + "timestamp": "2025-09-05 09:12:25.498366", + "step": 3022, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:25.701436", + "step": 3022, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1959143728017807, + "timestamp": "2025-09-05 09:12:25.703511", + "step": 3023, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:25.909136", + "step": 3023, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3365468978881836, + "timestamp": "2025-09-05 09:12:25.923098", + "step": 3024, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:26.118433", + "step": 3024, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29884475469589233, + "timestamp": "2025-09-05 09:12:26.120908", + "step": 3025, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:12:26.316247", + "step": 3025, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2602010667324066, + "timestamp": "2025-09-05 09:12:26.318653", + "step": 3026, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:12:26.524920", + "step": 3026, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28113895654678345, + "timestamp": "2025-09-05 09:12:26.527074", + "step": 3027, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:12:26.722996", + "step": 3027, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34172090888023376, + "timestamp": "2025-09-05 09:12:26.741293", + "step": 3028, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:12:26.938961", + "step": 3028, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.401425838470459, + "timestamp": "2025-09-05 09:12:26.940826", + "step": 3029, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:27.146216", + "step": 3029, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2900570034980774, + "timestamp": "2025-09-05 09:12:27.148185", + "step": 3030, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:27.343391", + "step": 3030, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2659049928188324, + "timestamp": "2025-09-05 09:12:27.345656", + "step": 3031, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:12:27.550953", + "step": 3031, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30906596779823303, + "timestamp": "2025-09-05 09:12:27.567599", + "step": 3032, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:27.761515", + "step": 3032, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4450221657752991, + "timestamp": "2025-09-05 09:12:27.763636", + "step": 3033, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:12:27.959754", + "step": 3033, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.35517576336860657, + "timestamp": "2025-09-05 09:12:27.962312", + "step": 3034, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:28.167097", + "step": 3034, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4547875225543976, + "timestamp": "2025-09-05 09:12:28.169352", + "step": 3035, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:28.364541", + "step": 3035, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2977902591228485, + "timestamp": "2025-09-05 09:12:28.381078", + "step": 3036, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:28.574247", + "step": 3036, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2584066092967987, + "timestamp": "2025-09-05 09:12:28.576226", + "step": 3037, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:28.778959", + "step": 3037, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22676676511764526, + "timestamp": "2025-09-05 09:12:28.781181", + "step": 3038, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:28.974249", + "step": 3038, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20184023678302765, + "timestamp": "2025-09-05 09:12:28.976433", + "step": 3039, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:29.179150", + "step": 3039, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3374442458152771, + "timestamp": "2025-09-05 09:12:29.195816", + "step": 3040, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:12:33.868272", + "step": 3040, + "epoch": 3 + }, + { + "type": "pplx", + "content": 52.40073254912315, + "timestamp": "2025-09-05 09:12:33.870120", + "step": 3040, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3040", + "timestamp": "2025-09-05 09:12:34.326846", + "step": 3040, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:34.467443", + "step": 3040, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4248269498348236, + "timestamp": "2025-09-05 09:12:34.469714", + "step": 3041, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:34.636452", + "step": 3041, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.25288718938827515, + "timestamp": "2025-09-05 09:12:34.638663", + "step": 3042, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:34.809846", + "step": 3042, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31233614683151245, + "timestamp": "2025-09-05 09:12:34.811951", + "step": 3043, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:34.970532", + "step": 3043, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3236229419708252, + "timestamp": "2025-09-05 09:12:34.984081", + "step": 3044, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:12:35.136091", + "step": 3044, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19996929168701172, + "timestamp": "2025-09-05 09:12:35.138576", + "step": 3045, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:12:35.296364", + "step": 3045, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2629333436489105, + "timestamp": "2025-09-05 09:12:35.298689", + "step": 3046, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:12:35.468744", + "step": 3046, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20324602723121643, + "timestamp": "2025-09-05 09:12:35.470496", + "step": 3047, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:35.628681", + "step": 3047, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3019815981388092, + "timestamp": "2025-09-05 09:12:35.642804", + "step": 3048, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:35.797526", + "step": 3048, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2801806628704071, + "timestamp": "2025-09-05 09:12:35.800581", + "step": 3049, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:12:35.970991", + "step": 3049, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27385973930358887, + "timestamp": "2025-09-05 09:12:35.973122", + "step": 3050, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:12:36.136878", + "step": 3050, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30080705881118774, + "timestamp": "2025-09-05 09:12:36.139304", + "step": 3051, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:12:36.307841", + "step": 3051, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3155876398086548, + "timestamp": "2025-09-05 09:12:36.324192", + "step": 3052, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:36.485707", + "step": 3052, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3727303445339203, + "timestamp": "2025-09-05 09:12:36.488132", + "step": 3053, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:36.646680", + "step": 3053, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21063818037509918, + "timestamp": "2025-09-05 09:12:36.648771", + "step": 3054, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:36.806161", + "step": 3054, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.37034881114959717, + "timestamp": "2025-09-05 09:12:36.808221", + "step": 3055, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:36.967297", + "step": 3055, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3373739719390869, + "timestamp": "2025-09-05 09:12:36.980504", + "step": 3056, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:37.134015", + "step": 3056, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2569211721420288, + "timestamp": "2025-09-05 09:12:37.136192", + "step": 3057, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:12:37.293082", + "step": 3057, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22303147614002228, + "timestamp": "2025-09-05 09:12:37.296720", + "step": 3058, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:12:37.472309", + "step": 3058, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2923431396484375, + "timestamp": "2025-09-05 09:12:37.474200", + "step": 3059, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:12:37.634249", + "step": 3059, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.41064414381980896, + "timestamp": "2025-09-05 09:12:37.648170", + "step": 3060, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:12:42.310853", + "step": 3060, + "epoch": 3 + }, + { + "type": "pplx", + "content": 53.04668741895991, + "timestamp": "2025-09-05 09:12:42.313061", + "step": 3060, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:12:42.445731", + "step": 3060, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2520500719547272, + "timestamp": "2025-09-05 09:12:42.448010", + "step": 3061, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:12:42.584143", + "step": 3061, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.33572205901145935, + "timestamp": "2025-09-05 09:12:42.588096", + "step": 3062, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:42.762657", + "step": 3062, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2813510298728943, + "timestamp": "2025-09-05 09:12:42.764778", + "step": 3063, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:42.925306", + "step": 3063, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23829667270183563, + "timestamp": "2025-09-05 09:12:42.941931", + "step": 3064, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:43.104263", + "step": 3064, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2956577241420746, + "timestamp": "2025-09-05 09:12:43.106274", + "step": 3065, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:43.266859", + "step": 3065, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24571780860424042, + "timestamp": "2025-09-05 09:12:43.269115", + "step": 3066, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:43.426658", + "step": 3066, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.17806021869182587, + "timestamp": "2025-09-05 09:12:43.429191", + "step": 3067, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:43.600917", + "step": 3067, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2593998908996582, + "timestamp": "2025-09-05 09:12:43.609713", + "step": 3068, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:43.742291", + "step": 3068, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34801289439201355, + "timestamp": "2025-09-05 09:12:43.744763", + "step": 3069, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:43.912633", + "step": 3069, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.5196738839149475, + "timestamp": "2025-09-05 09:12:43.914724", + "step": 3070, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:44.073133", + "step": 3070, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3646704852581024, + "timestamp": "2025-09-05 09:12:44.075393", + "step": 3071, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:44.248356", + "step": 3071, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3963429033756256, + "timestamp": "2025-09-05 09:12:44.265014", + "step": 3072, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:12:44.429735", + "step": 3072, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.265206903219223, + "timestamp": "2025-09-05 09:12:44.432399", + "step": 3073, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:44.596582", + "step": 3073, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26692867279052734, + "timestamp": "2025-09-05 09:12:44.599011", + "step": 3074, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:12:44.772730", + "step": 3074, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.45934414863586426, + "timestamp": "2025-09-05 09:12:44.774557", + "step": 3075, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:12:44.949906", + "step": 3075, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34467387199401855, + "timestamp": "2025-09-05 09:12:44.966102", + "step": 3076, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:45.159510", + "step": 3076, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2720514237880707, + "timestamp": "2025-09-05 09:12:45.161795", + "step": 3077, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:45.327367", + "step": 3077, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23640888929367065, + "timestamp": "2025-09-05 09:12:45.329368", + "step": 3078, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:45.492634", + "step": 3078, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.13110221922397614, + "timestamp": "2025-09-05 09:12:45.494825", + "step": 3079, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:45.659733", + "step": 3079, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3650048077106476, + "timestamp": "2025-09-05 09:12:45.675789", + "step": 3080, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:12:50.314454", + "step": 3080, + "epoch": 3 + }, + { + "type": "pplx", + "content": 53.94136359098753, + "timestamp": "2025-09-05 09:12:50.316945", + "step": 3080, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3080", + "timestamp": "2025-09-05 09:12:50.796727", + "step": 3080, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:50.935383", + "step": 3080, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2984461188316345, + "timestamp": "2025-09-05 09:12:50.937442", + "step": 3081, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:12:51.107896", + "step": 3081, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3399762213230133, + "timestamp": "2025-09-05 09:12:51.109895", + "step": 3082, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:51.274249", + "step": 3082, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19832485914230347, + "timestamp": "2025-09-05 09:12:51.276230", + "step": 3083, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:12:51.440549", + "step": 3083, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3151915669441223, + "timestamp": "2025-09-05 09:12:51.454599", + "step": 3084, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:51.617929", + "step": 3084, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27868831157684326, + "timestamp": "2025-09-05 09:12:51.620026", + "step": 3085, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:51.786145", + "step": 3085, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3083263635635376, + "timestamp": "2025-09-05 09:12:51.788444", + "step": 3086, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:12:51.962175", + "step": 3086, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23392392694950104, + "timestamp": "2025-09-05 09:12:51.964192", + "step": 3087, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:52.128880", + "step": 3087, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.36243969202041626, + "timestamp": "2025-09-05 09:12:52.142581", + "step": 3088, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:12:52.301060", + "step": 3088, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.36660972237586975, + "timestamp": "2025-09-05 09:12:52.303374", + "step": 3089, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:52.467650", + "step": 3089, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1570238471031189, + "timestamp": "2025-09-05 09:12:52.469835", + "step": 3090, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:12:52.643306", + "step": 3090, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22328078746795654, + "timestamp": "2025-09-05 09:12:52.645564", + "step": 3091, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:12:52.810208", + "step": 3091, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34495118260383606, + "timestamp": "2025-09-05 09:12:52.827558", + "step": 3092, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:52.985898", + "step": 3092, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3345975875854492, + "timestamp": "2025-09-05 09:12:52.988073", + "step": 3093, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:12:53.151108", + "step": 3093, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31810086965560913, + "timestamp": "2025-09-05 09:12:53.154398", + "step": 3094, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:53.325100", + "step": 3094, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3396746814250946, + "timestamp": "2025-09-05 09:12:53.327879", + "step": 3095, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:53.492335", + "step": 3095, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.319447785615921, + "timestamp": "2025-09-05 09:12:53.508844", + "step": 3096, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:53.675150", + "step": 3096, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.422185480594635, + "timestamp": "2025-09-05 09:12:53.677142", + "step": 3097, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:12:53.845714", + "step": 3097, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24050793051719666, + "timestamp": "2025-09-05 09:12:53.848556", + "step": 3098, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:54.014849", + "step": 3098, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3138499855995178, + "timestamp": "2025-09-05 09:12:54.017092", + "step": 3099, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:12:54.180414", + "step": 3099, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20346899330615997, + "timestamp": "2025-09-05 09:12:54.194225", + "step": 3100, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:12:58.862549", + "step": 3100, + "epoch": 3 + }, + { + "type": "pplx", + "content": 54.427679569639615, + "timestamp": "2025-09-05 09:12:58.865548", + "step": 3100, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:59.033815", + "step": 3100, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30215319991111755, + "timestamp": "2025-09-05 09:12:59.035886", + "step": 3101, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:12:59.201630", + "step": 3101, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26254820823669434, + "timestamp": "2025-09-05 09:12:59.203802", + "step": 3102, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:12:59.409539", + "step": 3102, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.32716652750968933, + "timestamp": "2025-09-05 09:12:59.412056", + "step": 3103, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:12:59.604460", + "step": 3103, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3003425896167755, + "timestamp": "2025-09-05 09:12:59.618822", + "step": 3104, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:12:59.806993", + "step": 3104, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2939733564853668, + "timestamp": "2025-09-05 09:12:59.809008", + "step": 3105, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:00.002760", + "step": 3105, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.25660908222198486, + "timestamp": "2025-09-05 09:13:00.004380", + "step": 3106, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:00.208171", + "step": 3106, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23837247490882874, + "timestamp": "2025-09-05 09:13:00.209793", + "step": 3107, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:00.372238", + "step": 3107, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34475165605545044, + "timestamp": "2025-09-05 09:13:00.388481", + "step": 3108, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:00.580255", + "step": 3108, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22767935693264008, + "timestamp": "2025-09-05 09:13:00.581944", + "step": 3109, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:13:00.782539", + "step": 3109, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28474533557891846, + "timestamp": "2025-09-05 09:13:00.784441", + "step": 3110, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:13:00.949594", + "step": 3110, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19329321384429932, + "timestamp": "2025-09-05 09:13:00.951484", + "step": 3111, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:01.145321", + "step": 3111, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.35781329870224, + "timestamp": "2025-09-05 09:13:01.158501", + "step": 3112, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:01.345180", + "step": 3112, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3530445694923401, + "timestamp": "2025-09-05 09:13:01.346934", + "step": 3113, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:01.540655", + "step": 3113, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.40410447120666504, + "timestamp": "2025-09-05 09:13:01.542300", + "step": 3114, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:01.736198", + "step": 3114, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4058263301849365, + "timestamp": "2025-09-05 09:13:01.737932", + "step": 3115, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:13:01.930582", + "step": 3115, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27423328161239624, + "timestamp": "2025-09-05 09:13:01.944971", + "step": 3116, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:02.141696", + "step": 3116, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2607077956199646, + "timestamp": "2025-09-05 09:13:02.143699", + "step": 3117, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:13:02.340557", + "step": 3117, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.42114734649658203, + "timestamp": "2025-09-05 09:13:02.342579", + "step": 3118, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:02.546126", + "step": 3118, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3532010316848755, + "timestamp": "2025-09-05 09:13:02.548110", + "step": 3119, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:02.742858", + "step": 3119, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23121574521064758, + "timestamp": "2025-09-05 09:13:02.758242", + "step": 3120, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:13:07.392319", + "step": 3120, + "epoch": 3 + }, + { + "type": "pplx", + "content": 55.011493179080475, + "timestamp": "2025-09-05 09:13:07.394549", + "step": 3120, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3120", + "timestamp": "2025-09-05 09:13:07.852680", + "step": 3120, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:08.021571", + "step": 3120, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3971679210662842, + "timestamp": "2025-09-05 09:13:08.023605", + "step": 3121, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:08.218220", + "step": 3121, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3160933256149292, + "timestamp": "2025-09-05 09:13:08.220278", + "step": 3122, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:13:08.423966", + "step": 3122, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31142422556877136, + "timestamp": "2025-09-05 09:13:08.426571", + "step": 3123, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:08.621738", + "step": 3123, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.32644516229629517, + "timestamp": "2025-09-05 09:13:08.637658", + "step": 3124, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:08.833274", + "step": 3124, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30763083696365356, + "timestamp": "2025-09-05 09:13:08.835053", + "step": 3125, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:09.040750", + "step": 3125, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26925909519195557, + "timestamp": "2025-09-05 09:13:09.042723", + "step": 3126, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:09.237489", + "step": 3126, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3116042912006378, + "timestamp": "2025-09-05 09:13:09.239615", + "step": 3127, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:09.404887", + "step": 3127, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21474801003932953, + "timestamp": "2025-09-05 09:13:09.421229", + "step": 3128, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:13:09.619863", + "step": 3128, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26150596141815186, + "timestamp": "2025-09-05 09:13:09.622785", + "step": 3129, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:09.820954", + "step": 3129, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1961638182401657, + "timestamp": "2025-09-05 09:13:09.823856", + "step": 3130, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:10.020164", + "step": 3130, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.40828442573547363, + "timestamp": "2025-09-05 09:13:10.022062", + "step": 3131, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:13:10.218948", + "step": 3131, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3171522915363312, + "timestamp": "2025-09-05 09:13:10.232774", + "step": 3132, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:10.419755", + "step": 3132, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.32482245564460754, + "timestamp": "2025-09-05 09:13:10.421934", + "step": 3133, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:10.615055", + "step": 3133, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3538453280925751, + "timestamp": "2025-09-05 09:13:10.617063", + "step": 3134, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:10.815295", + "step": 3134, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24779179692268372, + "timestamp": "2025-09-05 09:13:10.817327", + "step": 3135, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:11.022062", + "step": 3135, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.32560235261917114, + "timestamp": "2025-09-05 09:13:11.035980", + "step": 3136, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:13:11.226163", + "step": 3136, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21415184438228607, + "timestamp": "2025-09-05 09:13:11.228008", + "step": 3137, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 240 + ], + "flops": 4800029206464.0 + }, + "timestamp": "2025-09-05 09:13:11.424295", + "step": 3137, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4218257665634155, + "timestamp": "2025-09-05 09:13:11.426517", + "step": 3138, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:13:11.622235", + "step": 3138, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23505598306655884, + "timestamp": "2025-09-05 09:13:11.624260", + "step": 3139, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:11.826337", + "step": 3139, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23702280223369598, + "timestamp": "2025-09-05 09:13:11.842648", + "step": 3140, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:13:16.493846", + "step": 3140, + "epoch": 3 + }, + { + "type": "pplx", + "content": 55.51095658927002, + "timestamp": "2025-09-05 09:13:16.496136", + "step": 3140, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:16.640097", + "step": 3140, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3143397867679596, + "timestamp": "2025-09-05 09:13:16.642303", + "step": 3141, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:16.807476", + "step": 3141, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3324390649795532, + "timestamp": "2025-09-05 09:13:16.809313", + "step": 3142, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:16.974136", + "step": 3142, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.33971095085144043, + "timestamp": "2025-09-05 09:13:16.975975", + "step": 3143, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:17.141360", + "step": 3143, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19881880283355713, + "timestamp": "2025-09-05 09:13:17.159763", + "step": 3144, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:17.353448", + "step": 3144, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.38393595814704895, + "timestamp": "2025-09-05 09:13:17.356113", + "step": 3145, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:17.534073", + "step": 3145, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2562781870365143, + "timestamp": "2025-09-05 09:13:17.535976", + "step": 3146, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:17.721573", + "step": 3146, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29539769887924194, + "timestamp": "2025-09-05 09:13:17.724247", + "step": 3147, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:13:17.893316", + "step": 3147, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3817562162876129, + "timestamp": "2025-09-05 09:13:17.907641", + "step": 3148, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:18.072599", + "step": 3148, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3453420102596283, + "timestamp": "2025-09-05 09:13:18.074806", + "step": 3149, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:18.210735", + "step": 3149, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4222257137298584, + "timestamp": "2025-09-05 09:13:18.212520", + "step": 3150, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:18.347726", + "step": 3150, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3828117847442627, + "timestamp": "2025-09-05 09:13:18.349613", + "step": 3151, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:18.522127", + "step": 3151, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.17302794754505157, + "timestamp": "2025-09-05 09:13:18.531119", + "step": 3152, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:18.663926", + "step": 3152, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.18933707475662231, + "timestamp": "2025-09-05 09:13:18.665871", + "step": 3153, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:13:18.799778", + "step": 3153, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.15503232181072235, + "timestamp": "2025-09-05 09:13:18.801802", + "step": 3154, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:18.982215", + "step": 3154, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23069196939468384, + "timestamp": "2025-09-05 09:13:18.984252", + "step": 3155, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:19.148184", + "step": 3155, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2175375521183014, + "timestamp": "2025-09-05 09:13:19.162405", + "step": 3156, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:13:19.327253", + "step": 3156, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34681564569473267, + "timestamp": "2025-09-05 09:13:19.329228", + "step": 3157, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:19.465782", + "step": 3157, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21456918120384216, + "timestamp": "2025-09-05 09:13:19.468315", + "step": 3158, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:13:19.632293", + "step": 3158, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2504569888114929, + "timestamp": "2025-09-05 09:13:19.634342", + "step": 3159, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:13:19.797236", + "step": 3159, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2558198571205139, + "timestamp": "2025-09-05 09:13:19.811715", + "step": 3160, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:13:24.479387", + "step": 3160, + "epoch": 3 + }, + { + "type": "pplx", + "content": 55.59432534042321, + "timestamp": "2025-09-05 09:13:24.481354", + "step": 3160, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3160", + "timestamp": "2025-09-05 09:13:24.938644", + "step": 3160, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:13:25.078249", + "step": 3160, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22591379284858704, + "timestamp": "2025-09-05 09:13:25.080397", + "step": 3161, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:13:25.235416", + "step": 3161, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3258766233921051, + "timestamp": "2025-09-05 09:13:25.237445", + "step": 3162, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:25.393153", + "step": 3162, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20219656825065613, + "timestamp": "2025-09-05 09:13:25.395273", + "step": 3163, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:25.565963", + "step": 3163, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28959089517593384, + "timestamp": "2025-09-05 09:13:25.580379", + "step": 3164, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:13:25.734032", + "step": 3164, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4348662495613098, + "timestamp": "2025-09-05 09:13:25.736065", + "step": 3165, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:25.893585", + "step": 3165, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28287631273269653, + "timestamp": "2025-09-05 09:13:25.895614", + "step": 3166, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:13:26.053273", + "step": 3166, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24401001632213593, + "timestamp": "2025-09-05 09:13:26.055349", + "step": 3167, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:26.215424", + "step": 3167, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20977631211280823, + "timestamp": "2025-09-05 09:13:26.229460", + "step": 3168, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:26.381101", + "step": 3168, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2639927268028259, + "timestamp": "2025-09-05 09:13:26.383275", + "step": 3169, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:26.540967", + "step": 3169, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22925163805484772, + "timestamp": "2025-09-05 09:13:26.543319", + "step": 3170, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:26.700237", + "step": 3170, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2756401300430298, + "timestamp": "2025-09-05 09:13:26.702363", + "step": 3171, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:26.871120", + "step": 3171, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2692306935787201, + "timestamp": "2025-09-05 09:13:26.885314", + "step": 3172, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:27.045316", + "step": 3172, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29711222648620605, + "timestamp": "2025-09-05 09:13:27.047747", + "step": 3173, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:13:27.216850", + "step": 3173, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23208114504814148, + "timestamp": "2025-09-05 09:13:27.218956", + "step": 3174, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:27.377079", + "step": 3174, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.38599449396133423, + "timestamp": "2025-09-05 09:13:27.379006", + "step": 3175, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:27.515279", + "step": 3175, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.39971405267715454, + "timestamp": "2025-09-05 09:13:27.531379", + "step": 3176, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:13:27.691399", + "step": 3176, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19964629411697388, + "timestamp": "2025-09-05 09:13:27.694855", + "step": 3177, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:27.854835", + "step": 3177, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2849213778972626, + "timestamp": "2025-09-05 09:13:27.857053", + "step": 3178, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:13:28.014115", + "step": 3178, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3031027317047119, + "timestamp": "2025-09-05 09:13:28.016142", + "step": 3179, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:28.174919", + "step": 3179, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28292617201805115, + "timestamp": "2025-09-05 09:13:28.188940", + "step": 3180, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:13:32.818971", + "step": 3180, + "epoch": 3 + }, + { + "type": "pplx", + "content": 55.892830971609754, + "timestamp": "2025-09-05 09:13:32.820927", + "step": 3180, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:32.988141", + "step": 3180, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3160610496997833, + "timestamp": "2025-09-05 09:13:32.989996", + "step": 3181, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:13:33.192947", + "step": 3181, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.39101895689964294, + "timestamp": "2025-09-05 09:13:33.194897", + "step": 3182, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:13:33.391519", + "step": 3182, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31756341457366943, + "timestamp": "2025-09-05 09:13:33.393262", + "step": 3183, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:33.598904", + "step": 3183, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.38891616463661194, + "timestamp": "2025-09-05 09:13:33.612305", + "step": 3184, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:33.808124", + "step": 3184, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23683956265449524, + "timestamp": "2025-09-05 09:13:33.810090", + "step": 3185, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:34.005063", + "step": 3185, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2844909727573395, + "timestamp": "2025-09-05 09:13:34.007312", + "step": 3186, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:34.202313", + "step": 3186, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3205225467681885, + "timestamp": "2025-09-05 09:13:34.204137", + "step": 3187, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:13:34.406967", + "step": 3187, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2209499329328537, + "timestamp": "2025-09-05 09:13:34.422971", + "step": 3188, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:34.619317", + "step": 3188, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2793022096157074, + "timestamp": "2025-09-05 09:13:34.621318", + "step": 3189, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:34.817222", + "step": 3189, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2650418281555176, + "timestamp": "2025-09-05 09:13:34.819152", + "step": 3190, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:35.014115", + "step": 3190, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3632805347442627, + "timestamp": "2025-09-05 09:13:35.016034", + "step": 3191, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:13:35.210886", + "step": 3191, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1768227517604828, + "timestamp": "2025-09-05 09:13:35.224422", + "step": 3192, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:35.412596", + "step": 3192, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3283153176307678, + "timestamp": "2025-09-05 09:13:35.414386", + "step": 3193, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:35.618536", + "step": 3193, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2565184235572815, + "timestamp": "2025-09-05 09:13:35.620431", + "step": 3194, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:35.816892", + "step": 3194, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21749939024448395, + "timestamp": "2025-09-05 09:13:35.819231", + "step": 3195, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:36.022908", + "step": 3195, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20354753732681274, + "timestamp": "2025-09-05 09:13:36.037137", + "step": 3196, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:36.225881", + "step": 3196, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3543854355812073, + "timestamp": "2025-09-05 09:13:36.227551", + "step": 3197, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:36.422313", + "step": 3197, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3425697088241577, + "timestamp": "2025-09-05 09:13:36.424066", + "step": 3198, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:13:36.629872", + "step": 3198, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21106812357902527, + "timestamp": "2025-09-05 09:13:36.632198", + "step": 3199, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:36.837408", + "step": 3199, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23504656553268433, + "timestamp": "2025-09-05 09:13:36.853502", + "step": 3200, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:13:41.533859", + "step": 3200, + "epoch": 3 + }, + { + "type": "pplx", + "content": 55.82782568641178, + "timestamp": "2025-09-05 09:13:41.536162", + "step": 3200, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3200", + "timestamp": "2025-09-05 09:13:42.000609", + "step": 3200, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:13:42.185163", + "step": 3200, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2856009304523468, + "timestamp": "2025-09-05 09:13:42.188048", + "step": 3201, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:42.383871", + "step": 3201, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21880973875522614, + "timestamp": "2025-09-05 09:13:42.385968", + "step": 3202, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:42.584596", + "step": 3202, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3296593725681305, + "timestamp": "2025-09-05 09:13:42.586488", + "step": 3203, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:42.783648", + "step": 3203, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4413667321205139, + "timestamp": "2025-09-05 09:13:42.799481", + "step": 3204, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:13:42.996163", + "step": 3204, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.25287240743637085, + "timestamp": "2025-09-05 09:13:42.998732", + "step": 3205, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:43.193562", + "step": 3205, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1394006609916687, + "timestamp": "2025-09-05 09:13:43.195855", + "step": 3206, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:13:43.392977", + "step": 3206, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3348866403102875, + "timestamp": "2025-09-05 09:13:43.395309", + "step": 3207, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:43.589743", + "step": 3207, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4713085889816284, + "timestamp": "2025-09-05 09:13:43.603613", + "step": 3208, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:43.795151", + "step": 3208, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3775988817214966, + "timestamp": "2025-09-05 09:13:43.797101", + "step": 3209, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:13:43.993547", + "step": 3209, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3147152066230774, + "timestamp": "2025-09-05 09:13:43.995458", + "step": 3210, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:13:44.199575", + "step": 3210, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3415358364582062, + "timestamp": "2025-09-05 09:13:44.201472", + "step": 3211, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:13:44.400141", + "step": 3211, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19705742597579956, + "timestamp": "2025-09-05 09:13:44.414020", + "step": 3212, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:44.603265", + "step": 3212, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4473814368247986, + "timestamp": "2025-09-05 09:13:44.605283", + "step": 3213, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:44.800855", + "step": 3213, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34939709305763245, + "timestamp": "2025-09-05 09:13:44.802749", + "step": 3214, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:44.998852", + "step": 3214, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.39002692699432373, + "timestamp": "2025-09-05 09:13:45.000884", + "step": 3215, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:45.197315", + "step": 3215, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20467509329319, + "timestamp": "2025-09-05 09:13:45.213118", + "step": 3216, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:45.407524", + "step": 3216, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3030194044113159, + "timestamp": "2025-09-05 09:13:45.409435", + "step": 3217, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:45.604678", + "step": 3217, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27976304292678833, + "timestamp": "2025-09-05 09:13:45.606660", + "step": 3218, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:45.801975", + "step": 3218, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19097599387168884, + "timestamp": "2025-09-05 09:13:45.804011", + "step": 3219, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:45.999009", + "step": 3219, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.314406156539917, + "timestamp": "2025-09-05 09:13:46.013392", + "step": 3220, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:13:50.632439", + "step": 3220, + "epoch": 3 + }, + { + "type": "pplx", + "content": 56.24416855105137, + "timestamp": "2025-09-05 09:13:50.634131", + "step": 3220, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:50.794930", + "step": 3220, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.303940087556839, + "timestamp": "2025-09-05 09:13:50.798304", + "step": 3221, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:13:51.000115", + "step": 3221, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24126599729061127, + "timestamp": "2025-09-05 09:13:51.002152", + "step": 3222, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:51.166682", + "step": 3222, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2524254322052002, + "timestamp": "2025-09-05 09:13:51.168699", + "step": 3223, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:13:51.366062", + "step": 3223, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3770129382610321, + "timestamp": "2025-09-05 09:13:51.380480", + "step": 3224, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:51.568788", + "step": 3224, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3300042450428009, + "timestamp": "2025-09-05 09:13:51.570467", + "step": 3225, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:13:51.774578", + "step": 3225, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3154258728027344, + "timestamp": "2025-09-05 09:13:51.776918", + "step": 3226, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:13:51.980358", + "step": 3226, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.14694538712501526, + "timestamp": "2025-09-05 09:13:51.982512", + "step": 3227, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:52.180855", + "step": 3227, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23194490373134613, + "timestamp": "2025-09-05 09:13:52.195702", + "step": 3228, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:52.384044", + "step": 3228, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34397757053375244, + "timestamp": "2025-09-05 09:13:52.386127", + "step": 3229, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:52.582752", + "step": 3229, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2911645472049713, + "timestamp": "2025-09-05 09:13:52.584730", + "step": 3230, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:52.779959", + "step": 3230, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4919881224632263, + "timestamp": "2025-09-05 09:13:52.782040", + "step": 3231, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:52.976859", + "step": 3231, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2355922907590866, + "timestamp": "2025-09-05 09:13:52.992677", + "step": 3232, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:13:53.187410", + "step": 3232, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.36438658833503723, + "timestamp": "2025-09-05 09:13:53.189659", + "step": 3233, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:13:53.395388", + "step": 3233, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.18304933607578278, + "timestamp": "2025-09-05 09:13:53.398335", + "step": 3234, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:13:53.597527", + "step": 3234, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27435052394866943, + "timestamp": "2025-09-05 09:13:53.599413", + "step": 3235, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:13:53.804343", + "step": 3235, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3234035074710846, + "timestamp": "2025-09-05 09:13:53.818643", + "step": 3236, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:13:54.009349", + "step": 3236, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.44214802980422974, + "timestamp": "2025-09-05 09:13:54.011547", + "step": 3237, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:13:54.215298", + "step": 3237, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19979646801948547, + "timestamp": "2025-09-05 09:13:54.217753", + "step": 3238, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:54.422847", + "step": 3238, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24443750083446503, + "timestamp": "2025-09-05 09:13:54.424945", + "step": 3239, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:13:54.628788", + "step": 3239, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26840123534202576, + "timestamp": "2025-09-05 09:13:54.641842", + "step": 3240, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:13:59.277876", + "step": 3240, + "epoch": 3 + }, + { + "type": "pplx", + "content": 56.90161940514098, + "timestamp": "2025-09-05 09:13:59.280406", + "step": 3240, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3240", + "timestamp": "2025-09-05 09:13:59.748306", + "step": 3240, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:13:59.918039", + "step": 3240, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4399571120738983, + "timestamp": "2025-09-05 09:13:59.920498", + "step": 3241, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:00.124459", + "step": 3241, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26113492250442505, + "timestamp": "2025-09-05 09:14:00.126551", + "step": 3242, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:00.322918", + "step": 3242, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.255985289812088, + "timestamp": "2025-09-05 09:14:00.326082", + "step": 3243, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:00.530352", + "step": 3243, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34716635942459106, + "timestamp": "2025-09-05 09:14:00.544313", + "step": 3244, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:00.732575", + "step": 3244, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24577327072620392, + "timestamp": "2025-09-05 09:14:00.735048", + "step": 3245, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:00.929746", + "step": 3245, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3293505907058716, + "timestamp": "2025-09-05 09:14:00.931782", + "step": 3246, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:01.127220", + "step": 3246, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3006509840488434, + "timestamp": "2025-09-05 09:14:01.129383", + "step": 3247, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:01.321083", + "step": 3247, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29109349846839905, + "timestamp": "2025-09-05 09:14:01.334922", + "step": 3248, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:01.531797", + "step": 3248, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19694913923740387, + "timestamp": "2025-09-05 09:14:01.534299", + "step": 3249, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:01.733757", + "step": 3249, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3834435045719147, + "timestamp": "2025-09-05 09:14:01.735505", + "step": 3250, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:01.938130", + "step": 3250, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23048020899295807, + "timestamp": "2025-09-05 09:14:01.940204", + "step": 3251, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:02.135458", + "step": 3251, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3646623492240906, + "timestamp": "2025-09-05 09:14:02.150277", + "step": 3252, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:14:02.339019", + "step": 3252, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3779314160346985, + "timestamp": "2025-09-05 09:14:02.341680", + "step": 3253, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:02.537969", + "step": 3253, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22319437563419342, + "timestamp": "2025-09-05 09:14:02.540092", + "step": 3254, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:02.737611", + "step": 3254, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3502451479434967, + "timestamp": "2025-09-05 09:14:02.739738", + "step": 3255, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:02.934475", + "step": 3255, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3203504681587219, + "timestamp": "2025-09-05 09:14:02.950514", + "step": 3256, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:03.146960", + "step": 3256, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2472524791955948, + "timestamp": "2025-09-05 09:14:03.149188", + "step": 3257, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:03.344244", + "step": 3257, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2531106770038605, + "timestamp": "2025-09-05 09:14:03.346032", + "step": 3258, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:14:03.552245", + "step": 3258, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4314362108707428, + "timestamp": "2025-09-05 09:14:03.554754", + "step": 3259, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:03.752049", + "step": 3259, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2161693572998047, + "timestamp": "2025-09-05 09:14:03.765613", + "step": 3260, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:14:08.486975", + "step": 3260, + "epoch": 3 + }, + { + "type": "pplx", + "content": 56.99661515636932, + "timestamp": "2025-09-05 09:14:08.493687", + "step": 3260, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:08.628509", + "step": 3260, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23275181651115417, + "timestamp": "2025-09-05 09:14:08.630855", + "step": 3261, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:08.793165", + "step": 3261, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4570353329181671, + "timestamp": "2025-09-05 09:14:08.797479", + "step": 3262, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:08.967300", + "step": 3262, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2490944266319275, + "timestamp": "2025-09-05 09:14:08.976549", + "step": 3263, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:09.140541", + "step": 3263, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28627151250839233, + "timestamp": "2025-09-05 09:14:09.160239", + "step": 3264, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:14:09.335688", + "step": 3264, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3446066677570343, + "timestamp": "2025-09-05 09:14:09.338447", + "step": 3265, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:09.519039", + "step": 3265, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2337915450334549, + "timestamp": "2025-09-05 09:14:09.528948", + "step": 3266, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:09.752462", + "step": 3266, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2857224941253662, + "timestamp": "2025-09-05 09:14:09.756325", + "step": 3267, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:09.992777", + "step": 3267, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.306959867477417, + "timestamp": "2025-09-05 09:14:10.021273", + "step": 3268, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:10.249649", + "step": 3268, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2844340205192566, + "timestamp": "2025-09-05 09:14:10.252493", + "step": 3269, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:10.449235", + "step": 3269, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23272141814231873, + "timestamp": "2025-09-05 09:14:10.452990", + "step": 3270, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:10.647682", + "step": 3270, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2352897822856903, + "timestamp": "2025-09-05 09:14:10.650540", + "step": 3271, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:10.842725", + "step": 3271, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23539894819259644, + "timestamp": "2025-09-05 09:14:10.855848", + "step": 3272, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:11.043914", + "step": 3272, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27581384778022766, + "timestamp": "2025-09-05 09:14:11.049234", + "step": 3273, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:14:11.253580", + "step": 3273, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28692498803138733, + "timestamp": "2025-09-05 09:14:11.255575", + "step": 3274, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:11.459964", + "step": 3274, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2923082411289215, + "timestamp": "2025-09-05 09:14:11.464675", + "step": 3275, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:11.663448", + "step": 3275, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19943730533123016, + "timestamp": "2025-09-05 09:14:11.678177", + "step": 3276, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:14:11.867747", + "step": 3276, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.33723267912864685, + "timestamp": "2025-09-05 09:14:11.869804", + "step": 3277, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:12.066298", + "step": 3277, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28404057025909424, + "timestamp": "2025-09-05 09:14:12.068253", + "step": 3278, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:12.270783", + "step": 3278, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4263806939125061, + "timestamp": "2025-09-05 09:14:12.273001", + "step": 3279, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:14:12.466169", + "step": 3279, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26628220081329346, + "timestamp": "2025-09-05 09:14:12.480160", + "step": 3280, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:14:17.120270", + "step": 3280, + "epoch": 3 + }, + { + "type": "pplx", + "content": 57.134940070215634, + "timestamp": "2025-09-05 09:14:17.122321", + "step": 3280, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3280", + "timestamp": "2025-09-05 09:14:17.606762", + "step": 3280, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:14:17.775913", + "step": 3280, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.25822189450263977, + "timestamp": "2025-09-05 09:14:17.778034", + "step": 3281, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:17.972040", + "step": 3281, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3605766296386719, + "timestamp": "2025-09-05 09:14:17.974036", + "step": 3282, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:18.168615", + "step": 3282, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.16432644426822662, + "timestamp": "2025-09-05 09:14:18.170672", + "step": 3283, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:18.337349", + "step": 3283, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.36961498856544495, + "timestamp": "2025-09-05 09:14:18.353700", + "step": 3284, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 240 + ], + "flops": 4800029206464.0 + }, + "timestamp": "2025-09-05 09:14:18.550693", + "step": 3284, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.314778596162796, + "timestamp": "2025-09-05 09:14:18.552728", + "step": 3285, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:18.747988", + "step": 3285, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2686954736709595, + "timestamp": "2025-09-05 09:14:18.750449", + "step": 3286, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:14:18.914285", + "step": 3286, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21251261234283447, + "timestamp": "2025-09-05 09:14:18.916301", + "step": 3287, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:19.116973", + "step": 3287, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.16383251547813416, + "timestamp": "2025-09-05 09:14:19.126361", + "step": 3288, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:19.286046", + "step": 3288, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2723452150821686, + "timestamp": "2025-09-05 09:14:19.288450", + "step": 3289, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:19.494176", + "step": 3289, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19908295571804047, + "timestamp": "2025-09-05 09:14:19.495960", + "step": 3290, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:19.699764", + "step": 3290, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3640027940273285, + "timestamp": "2025-09-05 09:14:19.701833", + "step": 3291, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:19.905047", + "step": 3291, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29716038703918457, + "timestamp": "2025-09-05 09:14:19.921246", + "step": 3292, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:20.117897", + "step": 3292, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23596565425395966, + "timestamp": "2025-09-05 09:14:20.122259", + "step": 3293, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 240 + ], + "flops": 4800029206464.0 + }, + "timestamp": "2025-09-05 09:14:20.328057", + "step": 3293, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3065091371536255, + "timestamp": "2025-09-05 09:14:20.330125", + "step": 3294, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:20.525257", + "step": 3294, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29849687218666077, + "timestamp": "2025-09-05 09:14:20.528075", + "step": 3295, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:20.734713", + "step": 3295, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2795962393283844, + "timestamp": "2025-09-05 09:14:20.748329", + "step": 3296, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:20.936480", + "step": 3296, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.32312214374542236, + "timestamp": "2025-09-05 09:14:20.938447", + "step": 3297, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:21.134729", + "step": 3297, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.310803085565567, + "timestamp": "2025-09-05 09:14:21.136747", + "step": 3298, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:21.332732", + "step": 3298, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26474887132644653, + "timestamp": "2025-09-05 09:14:21.335441", + "step": 3299, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:14:21.501888", + "step": 3299, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.25056520104408264, + "timestamp": "2025-09-05 09:14:21.518681", + "step": 3300, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:14:26.147488", + "step": 3300, + "epoch": 3 + }, + { + "type": "pplx", + "content": 57.88412892104688, + "timestamp": "2025-09-05 09:14:26.149332", + "step": 3300, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:26.309641", + "step": 3300, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24012216925621033, + "timestamp": "2025-09-05 09:14:26.311702", + "step": 3301, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:26.477497", + "step": 3301, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3001227378845215, + "timestamp": "2025-09-05 09:14:26.479473", + "step": 3302, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:26.684596", + "step": 3302, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.39674386382102966, + "timestamp": "2025-09-05 09:14:26.686953", + "step": 3303, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:26.882026", + "step": 3303, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22072692215442657, + "timestamp": "2025-09-05 09:14:26.895795", + "step": 3304, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:27.083043", + "step": 3304, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2948031723499298, + "timestamp": "2025-09-05 09:14:27.084945", + "step": 3305, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:27.279441", + "step": 3305, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28666290640830994, + "timestamp": "2025-09-05 09:14:27.281505", + "step": 3306, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:27.476574", + "step": 3306, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3318125903606415, + "timestamp": "2025-09-05 09:14:27.478516", + "step": 3307, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:27.673538", + "step": 3307, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20074622333049774, + "timestamp": "2025-09-05 09:14:27.687577", + "step": 3308, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:27.875149", + "step": 3308, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34110990166664124, + "timestamp": "2025-09-05 09:14:27.877311", + "step": 3309, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:28.073214", + "step": 3309, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31203046441078186, + "timestamp": "2025-09-05 09:14:28.075308", + "step": 3310, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:14:28.270514", + "step": 3310, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3481146991252899, + "timestamp": "2025-09-05 09:14:28.272785", + "step": 3311, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:28.467367", + "step": 3311, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2804660201072693, + "timestamp": "2025-09-05 09:14:28.481098", + "step": 3312, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:28.669353", + "step": 3312, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30905982851982117, + "timestamp": "2025-09-05 09:14:28.671337", + "step": 3313, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:28.875523", + "step": 3313, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3836857080459595, + "timestamp": "2025-09-05 09:14:28.877566", + "step": 3314, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:29.072635", + "step": 3314, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20405101776123047, + "timestamp": "2025-09-05 09:14:29.075571", + "step": 3315, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:29.271910", + "step": 3315, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2559118866920471, + "timestamp": "2025-09-05 09:14:29.285986", + "step": 3316, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:29.473237", + "step": 3316, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24376524984836578, + "timestamp": "2025-09-05 09:14:29.474965", + "step": 3317, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:14:29.679559", + "step": 3317, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.36361056566238403, + "timestamp": "2025-09-05 09:14:29.681590", + "step": 3318, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:29.877224", + "step": 3318, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21259449422359467, + "timestamp": "2025-09-05 09:14:29.879560", + "step": 3319, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:14:30.084812", + "step": 3319, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.13015785813331604, + "timestamp": "2025-09-05 09:14:30.097888", + "step": 3320, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:14:34.729703", + "step": 3320, + "epoch": 3 + }, + { + "type": "pplx", + "content": 58.29189660131222, + "timestamp": "2025-09-05 09:14:34.731755", + "step": 3320, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3320", + "timestamp": "2025-09-05 09:14:35.184580", + "step": 3320, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:35.352584", + "step": 3320, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24831537902355194, + "timestamp": "2025-09-05 09:14:35.354845", + "step": 3321, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:14:35.521604", + "step": 3321, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3621161878108978, + "timestamp": "2025-09-05 09:14:35.523647", + "step": 3322, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:35.719190", + "step": 3322, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.39665305614471436, + "timestamp": "2025-09-05 09:14:35.720906", + "step": 3323, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:35.914640", + "step": 3323, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2833714187145233, + "timestamp": "2025-09-05 09:14:35.927820", + "step": 3324, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:36.115552", + "step": 3324, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24043862521648407, + "timestamp": "2025-09-05 09:14:36.117529", + "step": 3325, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:36.322088", + "step": 3325, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30250465869903564, + "timestamp": "2025-09-05 09:14:36.324460", + "step": 3326, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:36.521003", + "step": 3326, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2354232370853424, + "timestamp": "2025-09-05 09:14:36.523015", + "step": 3327, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:36.716681", + "step": 3327, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2318793684244156, + "timestamp": "2025-09-05 09:14:36.730420", + "step": 3328, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:36.918065", + "step": 3328, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.40657877922058105, + "timestamp": "2025-09-05 09:14:36.920782", + "step": 3329, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:14:37.116457", + "step": 3329, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.33501723408699036, + "timestamp": "2025-09-05 09:14:37.122192", + "step": 3330, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:37.288149", + "step": 3330, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3885265290737152, + "timestamp": "2025-09-05 09:14:37.290154", + "step": 3331, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:37.484428", + "step": 3331, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2885796129703522, + "timestamp": "2025-09-05 09:14:37.498324", + "step": 3332, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:37.693667", + "step": 3332, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20705479383468628, + "timestamp": "2025-09-05 09:14:37.695563", + "step": 3333, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:14:37.891306", + "step": 3333, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.35460859537124634, + "timestamp": "2025-09-05 09:14:37.893113", + "step": 3334, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:14:38.056266", + "step": 3334, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2885071039199829, + "timestamp": "2025-09-05 09:14:38.058140", + "step": 3335, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:38.262325", + "step": 3335, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31416335701942444, + "timestamp": "2025-09-05 09:14:38.276155", + "step": 3336, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:14:38.465017", + "step": 3336, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3768198788166046, + "timestamp": "2025-09-05 09:14:38.467262", + "step": 3337, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:38.662768", + "step": 3337, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20448677241802216, + "timestamp": "2025-09-05 09:14:38.665904", + "step": 3338, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:38.867273", + "step": 3338, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.18872617185115814, + "timestamp": "2025-09-05 09:14:38.869675", + "step": 3339, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:39.064688", + "step": 3339, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2098010927438736, + "timestamp": "2025-09-05 09:14:39.077793", + "step": 3340, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:14:43.699963", + "step": 3340, + "epoch": 3 + }, + { + "type": "pplx", + "content": 57.79616724128643, + "timestamp": "2025-09-05 09:14:43.702238", + "step": 3340, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:43.862535", + "step": 3340, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1867901235818863, + "timestamp": "2025-09-05 09:14:43.864350", + "step": 3341, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:44.027936", + "step": 3341, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.221108078956604, + "timestamp": "2025-09-05 09:14:44.030146", + "step": 3342, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:44.231856", + "step": 3342, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34810617566108704, + "timestamp": "2025-09-05 09:14:44.234406", + "step": 3343, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:14:44.431426", + "step": 3343, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2502887547016144, + "timestamp": "2025-09-05 09:14:44.445148", + "step": 3344, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:14:44.633890", + "step": 3344, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29671764373779297, + "timestamp": "2025-09-05 09:14:44.635930", + "step": 3345, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:44.827997", + "step": 3345, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.37294748425483704, + "timestamp": "2025-09-05 09:14:44.831072", + "step": 3346, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:45.025566", + "step": 3346, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1889505237340927, + "timestamp": "2025-09-05 09:14:45.028729", + "step": 3347, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:45.221524", + "step": 3347, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3558424711227417, + "timestamp": "2025-09-05 09:14:45.236034", + "step": 3348, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:45.418501", + "step": 3348, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.334084689617157, + "timestamp": "2025-09-05 09:14:45.421506", + "step": 3349, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:45.616166", + "step": 3349, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3016354739665985, + "timestamp": "2025-09-05 09:14:45.618253", + "step": 3350, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:45.812162", + "step": 3350, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.32526397705078125, + "timestamp": "2025-09-05 09:14:45.814521", + "step": 3351, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:46.010569", + "step": 3351, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3452492952346802, + "timestamp": "2025-09-05 09:14:46.025210", + "step": 3352, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:14:46.214549", + "step": 3352, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2737857401371002, + "timestamp": "2025-09-05 09:14:46.216639", + "step": 3353, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:46.421245", + "step": 3353, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28672438859939575, + "timestamp": "2025-09-05 09:14:46.423862", + "step": 3354, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:46.619848", + "step": 3354, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2686102092266083, + "timestamp": "2025-09-05 09:14:46.622311", + "step": 3355, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:46.817721", + "step": 3355, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.5274847745895386, + "timestamp": "2025-09-05 09:14:46.831979", + "step": 3356, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:14:47.020822", + "step": 3356, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3943261206150055, + "timestamp": "2025-09-05 09:14:47.022852", + "step": 3357, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:47.226539", + "step": 3357, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29574286937713623, + "timestamp": "2025-09-05 09:14:47.228426", + "step": 3358, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:47.423390", + "step": 3358, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22139273583889008, + "timestamp": "2025-09-05 09:14:47.425194", + "step": 3359, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:47.630502", + "step": 3359, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22906312346458435, + "timestamp": "2025-09-05 09:14:47.643775", + "step": 3360, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:14:52.285789", + "step": 3360, + "epoch": 3 + }, + { + "type": "pplx", + "content": 56.80259232058196, + "timestamp": "2025-09-05 09:14:52.287807", + "step": 3360, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3360", + "timestamp": "2025-09-05 09:14:52.754322", + "step": 3360, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:52.922250", + "step": 3360, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2129780501127243, + "timestamp": "2025-09-05 09:14:52.924346", + "step": 3361, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:53.126998", + "step": 3361, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2788529396057129, + "timestamp": "2025-09-05 09:14:53.129039", + "step": 3362, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:53.323035", + "step": 3362, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1642555296421051, + "timestamp": "2025-09-05 09:14:53.325772", + "step": 3363, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:53.529352", + "step": 3363, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2505939304828644, + "timestamp": "2025-09-05 09:14:53.544030", + "step": 3364, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:53.733373", + "step": 3364, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3860357403755188, + "timestamp": "2025-09-05 09:14:53.735306", + "step": 3365, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:53.902689", + "step": 3365, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2575131058692932, + "timestamp": "2025-09-05 09:14:53.904636", + "step": 3366, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:54.111034", + "step": 3366, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2468288540840149, + "timestamp": "2025-09-05 09:14:54.113207", + "step": 3367, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:54.280129", + "step": 3367, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21068817377090454, + "timestamp": "2025-09-05 09:14:54.296230", + "step": 3368, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:54.490215", + "step": 3368, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3015161454677582, + "timestamp": "2025-09-05 09:14:54.492249", + "step": 3369, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:54.695993", + "step": 3369, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.32386377453804016, + "timestamp": "2025-09-05 09:14:54.698406", + "step": 3370, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:14:54.905151", + "step": 3370, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3242306709289551, + "timestamp": "2025-09-05 09:14:54.910037", + "step": 3371, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:55.127944", + "step": 3371, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3697621524333954, + "timestamp": "2025-09-05 09:14:55.142607", + "step": 3372, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:14:55.330924", + "step": 3372, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2763964831829071, + "timestamp": "2025-09-05 09:14:55.332840", + "step": 3373, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:55.536582", + "step": 3373, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.237064391374588, + "timestamp": "2025-09-05 09:14:55.538791", + "step": 3374, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:55.744391", + "step": 3374, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3390370309352875, + "timestamp": "2025-09-05 09:14:55.746411", + "step": 3375, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:55.941059", + "step": 3375, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3362955152988434, + "timestamp": "2025-09-05 09:14:55.954866", + "step": 3376, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:14:56.144253", + "step": 3376, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3102271854877472, + "timestamp": "2025-09-05 09:14:56.146687", + "step": 3377, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:14:56.349031", + "step": 3377, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2332654595375061, + "timestamp": "2025-09-05 09:14:56.351094", + "step": 3378, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:14:56.546133", + "step": 3378, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.297242134809494, + "timestamp": "2025-09-05 09:14:56.548284", + "step": 3379, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:14:56.743311", + "step": 3379, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3286650478839874, + "timestamp": "2025-09-05 09:14:56.756242", + "step": 3380, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:15:01.458280", + "step": 3380, + "epoch": 3 + }, + { + "type": "pplx", + "content": 57.62159304092415, + "timestamp": "2025-09-05 09:15:01.460429", + "step": 3380, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:01.620078", + "step": 3380, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.35747459530830383, + "timestamp": "2025-09-05 09:15:01.622312", + "step": 3381, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:01.788171", + "step": 3381, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21469105780124664, + "timestamp": "2025-09-05 09:15:01.789982", + "step": 3382, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:01.964218", + "step": 3382, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23298673331737518, + "timestamp": "2025-09-05 09:15:01.966641", + "step": 3383, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:02.175542", + "step": 3383, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20766644179821014, + "timestamp": "2025-09-05 09:15:02.191896", + "step": 3384, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:02.389069", + "step": 3384, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3102535009384155, + "timestamp": "2025-09-05 09:15:02.391536", + "step": 3385, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:02.588812", + "step": 3385, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23322685062885284, + "timestamp": "2025-09-05 09:15:02.591193", + "step": 3386, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:02.787088", + "step": 3386, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2468855381011963, + "timestamp": "2025-09-05 09:15:02.789001", + "step": 3387, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:02.984473", + "step": 3387, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34562644362449646, + "timestamp": "2025-09-05 09:15:02.998435", + "step": 3388, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:15:03.188465", + "step": 3388, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27134451270103455, + "timestamp": "2025-09-05 09:15:03.190918", + "step": 3389, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:03.395100", + "step": 3389, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28415605425834656, + "timestamp": "2025-09-05 09:15:03.397109", + "step": 3390, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:03.561477", + "step": 3390, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24175278842449188, + "timestamp": "2025-09-05 09:15:03.563253", + "step": 3391, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:03.768969", + "step": 3391, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2102387249469757, + "timestamp": "2025-09-05 09:15:03.786745", + "step": 3392, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:03.983685", + "step": 3392, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.32016080617904663, + "timestamp": "2025-09-05 09:15:03.986311", + "step": 3393, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:04.181398", + "step": 3393, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20062977075576782, + "timestamp": "2025-09-05 09:15:04.183780", + "step": 3394, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:04.387838", + "step": 3394, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3682517409324646, + "timestamp": "2025-09-05 09:15:04.389851", + "step": 3395, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:04.553368", + "step": 3395, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.341085284948349, + "timestamp": "2025-09-05 09:15:04.569588", + "step": 3396, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:15:04.765544", + "step": 3396, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.32158076763153076, + "timestamp": "2025-09-05 09:15:04.767464", + "step": 3397, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:04.932611", + "step": 3397, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.25980469584465027, + "timestamp": "2025-09-05 09:15:04.934471", + "step": 3398, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:05.139083", + "step": 3398, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2603665292263031, + "timestamp": "2025-09-05 09:15:05.140946", + "step": 3399, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:05.305222", + "step": 3399, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3023158311843872, + "timestamp": "2025-09-05 09:15:05.324842", + "step": 3400, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:15:10.151502", + "step": 3400, + "epoch": 3 + }, + { + "type": "pplx", + "content": 57.15140966805781, + "timestamp": "2025-09-05 09:15:10.153894", + "step": 3400, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3400", + "timestamp": "2025-09-05 09:15:10.623749", + "step": 3400, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:15:10.790882", + "step": 3400, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2876795530319214, + "timestamp": "2025-09-05 09:15:10.793809", + "step": 3401, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:10.959766", + "step": 3401, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20607417821884155, + "timestamp": "2025-09-05 09:15:10.962061", + "step": 3402, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:11.168326", + "step": 3402, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2529613673686981, + "timestamp": "2025-09-05 09:15:11.170402", + "step": 3403, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:11.366376", + "step": 3403, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2774178385734558, + "timestamp": "2025-09-05 09:15:11.380522", + "step": 3404, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:11.570006", + "step": 3404, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23143890500068665, + "timestamp": "2025-09-05 09:15:11.572036", + "step": 3405, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:11.767214", + "step": 3405, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2894435226917267, + "timestamp": "2025-09-05 09:15:11.769237", + "step": 3406, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:11.934854", + "step": 3406, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28832849860191345, + "timestamp": "2025-09-05 09:15:11.937495", + "step": 3407, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:12.141320", + "step": 3407, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3540599048137665, + "timestamp": "2025-09-05 09:15:12.157930", + "step": 3408, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:15:12.356207", + "step": 3408, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4962592124938965, + "timestamp": "2025-09-05 09:15:12.359073", + "step": 3409, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:12.562077", + "step": 3409, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3610974848270416, + "timestamp": "2025-09-05 09:15:12.564088", + "step": 3410, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:12.760083", + "step": 3410, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.238719180226326, + "timestamp": "2025-09-05 09:15:12.762436", + "step": 3411, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:12.958223", + "step": 3411, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3288671374320984, + "timestamp": "2025-09-05 09:15:12.972265", + "step": 3412, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:13.169674", + "step": 3412, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.32551729679107666, + "timestamp": "2025-09-05 09:15:13.172045", + "step": 3413, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:13.378475", + "step": 3413, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24862384796142578, + "timestamp": "2025-09-05 09:15:13.380797", + "step": 3414, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:13.576666", + "step": 3414, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28981438279151917, + "timestamp": "2025-09-05 09:15:13.578607", + "step": 3415, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:13.783393", + "step": 3415, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2465643584728241, + "timestamp": "2025-09-05 09:15:13.797643", + "step": 3416, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:13.990010", + "step": 3416, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.36424243450164795, + "timestamp": "2025-09-05 09:15:13.992100", + "step": 3417, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:14.188052", + "step": 3417, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2606428861618042, + "timestamp": "2025-09-05 09:15:14.190080", + "step": 3418, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:14.355441", + "step": 3418, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.335843950510025, + "timestamp": "2025-09-05 09:15:14.357952", + "step": 3419, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:14.563346", + "step": 3419, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3763747215270996, + "timestamp": "2025-09-05 09:15:14.577993", + "step": 3420, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:15:19.236496", + "step": 3420, + "epoch": 3 + }, + { + "type": "pplx", + "content": 56.271628643405734, + "timestamp": "2025-09-05 09:15:19.238464", + "step": 3420, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:19.398491", + "step": 3420, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26849332451820374, + "timestamp": "2025-09-05 09:15:19.400593", + "step": 3421, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:19.565877", + "step": 3421, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2555030584335327, + "timestamp": "2025-09-05 09:15:19.568109", + "step": 3422, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:19.770532", + "step": 3422, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3213971257209778, + "timestamp": "2025-09-05 09:15:19.772369", + "step": 3423, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:19.968537", + "step": 3423, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2824600338935852, + "timestamp": "2025-09-05 09:15:19.982086", + "step": 3424, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:15:20.169982", + "step": 3424, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2855144739151001, + "timestamp": "2025-09-05 09:15:20.172138", + "step": 3425, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:20.366577", + "step": 3425, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3020437955856323, + "timestamp": "2025-09-05 09:15:20.368634", + "step": 3426, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:15:20.564048", + "step": 3426, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26419728994369507, + "timestamp": "2025-09-05 09:15:20.566245", + "step": 3427, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:20.762084", + "step": 3427, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3464629054069519, + "timestamp": "2025-09-05 09:15:20.776139", + "step": 3428, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:20.971258", + "step": 3428, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2719259560108185, + "timestamp": "2025-09-05 09:15:20.973178", + "step": 3429, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:21.177875", + "step": 3429, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29615768790245056, + "timestamp": "2025-09-05 09:15:21.179698", + "step": 3430, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:21.374975", + "step": 3430, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1992407739162445, + "timestamp": "2025-09-05 09:15:21.377643", + "step": 3431, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:21.573803", + "step": 3431, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23724502325057983, + "timestamp": "2025-09-05 09:15:21.587778", + "step": 3432, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:21.774715", + "step": 3432, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3083495795726776, + "timestamp": "2025-09-05 09:15:21.776739", + "step": 3433, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:21.980420", + "step": 3433, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3451620638370514, + "timestamp": "2025-09-05 09:15:21.982583", + "step": 3434, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:22.179465", + "step": 3434, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3891802728176117, + "timestamp": "2025-09-05 09:15:22.181453", + "step": 3435, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:22.385405", + "step": 3435, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3739110231399536, + "timestamp": "2025-09-05 09:15:22.399825", + "step": 3436, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:22.595466", + "step": 3436, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2214849293231964, + "timestamp": "2025-09-05 09:15:22.600093", + "step": 3437, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:22.767477", + "step": 3437, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.36019426584243774, + "timestamp": "2025-09-05 09:15:22.775452", + "step": 3438, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:22.982396", + "step": 3438, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1959642767906189, + "timestamp": "2025-09-05 09:15:22.984052", + "step": 3439, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:15:23.180164", + "step": 3439, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30671948194503784, + "timestamp": "2025-09-05 09:15:23.194420", + "step": 3440, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:15:27.842056", + "step": 3440, + "epoch": 3 + }, + { + "type": "pplx", + "content": 56.487056186244445, + "timestamp": "2025-09-05 09:15:27.844216", + "step": 3440, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3440", + "timestamp": "2025-09-05 09:15:28.308530", + "step": 3440, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:28.469196", + "step": 3440, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4308766722679138, + "timestamp": "2025-09-05 09:15:28.471395", + "step": 3441, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:28.672848", + "step": 3441, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2569759488105774, + "timestamp": "2025-09-05 09:15:28.675252", + "step": 3442, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:28.871485", + "step": 3442, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.39308369159698486, + "timestamp": "2025-09-05 09:15:28.873593", + "step": 3443, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:29.067035", + "step": 3443, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.36607518792152405, + "timestamp": "2025-09-05 09:15:29.080686", + "step": 3444, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:29.266327", + "step": 3444, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3157901167869568, + "timestamp": "2025-09-05 09:15:29.268664", + "step": 3445, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:15:29.463369", + "step": 3445, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.32073408365249634, + "timestamp": "2025-09-05 09:15:29.465382", + "step": 3446, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:29.668771", + "step": 3446, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21521615982055664, + "timestamp": "2025-09-05 09:15:29.670848", + "step": 3447, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:29.873604", + "step": 3447, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24196557700634003, + "timestamp": "2025-09-05 09:15:29.887959", + "step": 3448, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:30.082517", + "step": 3448, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34385359287261963, + "timestamp": "2025-09-05 09:15:30.084676", + "step": 3449, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:30.279879", + "step": 3449, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2406640201807022, + "timestamp": "2025-09-05 09:15:30.282237", + "step": 3450, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:30.475842", + "step": 3450, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.18529509007930756, + "timestamp": "2025-09-05 09:15:30.477976", + "step": 3451, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:30.672260", + "step": 3451, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3604893088340759, + "timestamp": "2025-09-05 09:15:30.688470", + "step": 3452, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:30.883472", + "step": 3452, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3433663249015808, + "timestamp": "2025-09-05 09:15:30.885759", + "step": 3453, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:31.079225", + "step": 3453, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2779248058795929, + "timestamp": "2025-09-05 09:15:31.082324", + "step": 3454, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:31.275564", + "step": 3454, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31973740458488464, + "timestamp": "2025-09-05 09:15:31.277984", + "step": 3455, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:31.471886", + "step": 3455, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.349806547164917, + "timestamp": "2025-09-05 09:15:31.486066", + "step": 3456, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:15:31.676963", + "step": 3456, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1699691265821457, + "timestamp": "2025-09-05 09:15:31.678853", + "step": 3457, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:31.871846", + "step": 3457, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4177989065647125, + "timestamp": "2025-09-05 09:15:31.883300", + "step": 3458, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:32.078448", + "step": 3458, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24496307969093323, + "timestamp": "2025-09-05 09:15:32.080765", + "step": 3459, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:32.284260", + "step": 3459, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24864445626735687, + "timestamp": "2025-09-05 09:15:32.298058", + "step": 3460, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:15:36.949048", + "step": 3460, + "epoch": 3 + }, + { + "type": "pplx", + "content": 56.84112110557716, + "timestamp": "2025-09-05 09:15:36.951078", + "step": 3460, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:37.108299", + "step": 3460, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.33684948086738586, + "timestamp": "2025-09-05 09:15:37.111450", + "step": 3461, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:37.276196", + "step": 3461, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.371894896030426, + "timestamp": "2025-09-05 09:15:37.278182", + "step": 3462, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:37.481793", + "step": 3462, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.41939064860343933, + "timestamp": "2025-09-05 09:15:37.483876", + "step": 3463, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:37.678809", + "step": 3463, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.16646873950958252, + "timestamp": "2025-09-05 09:15:37.692365", + "step": 3464, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:37.887252", + "step": 3464, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27085840702056885, + "timestamp": "2025-09-05 09:15:37.889497", + "step": 3465, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:15:38.085222", + "step": 3465, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3408336639404297, + "timestamp": "2025-09-05 09:15:38.087272", + "step": 3466, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:38.288038", + "step": 3466, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23875372111797333, + "timestamp": "2025-09-05 09:15:38.290372", + "step": 3467, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:38.484252", + "step": 3467, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28153443336486816, + "timestamp": "2025-09-05 09:15:38.498227", + "step": 3468, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:38.686591", + "step": 3468, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3948226869106293, + "timestamp": "2025-09-05 09:15:38.689339", + "step": 3469, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:38.883108", + "step": 3469, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.253915011882782, + "timestamp": "2025-09-05 09:15:38.885040", + "step": 3470, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:39.079939", + "step": 3470, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2193460613489151, + "timestamp": "2025-09-05 09:15:39.081811", + "step": 3471, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:39.276867", + "step": 3471, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19324208796024323, + "timestamp": "2025-09-05 09:15:39.291080", + "step": 3472, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:39.478061", + "step": 3472, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3039948344230652, + "timestamp": "2025-09-05 09:15:39.479952", + "step": 3473, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:39.675488", + "step": 3473, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.36532217264175415, + "timestamp": "2025-09-05 09:15:39.677479", + "step": 3474, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:15:39.878087", + "step": 3474, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2959711253643036, + "timestamp": "2025-09-05 09:15:39.879967", + "step": 3475, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:40.075382", + "step": 3475, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.44074317812919617, + "timestamp": "2025-09-05 09:15:40.091771", + "step": 3476, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:40.286274", + "step": 3476, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2943912446498871, + "timestamp": "2025-09-05 09:15:40.288213", + "step": 3477, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:15:40.490989", + "step": 3477, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21967966854572296, + "timestamp": "2025-09-05 09:15:40.492987", + "step": 3478, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:40.688577", + "step": 3478, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1738353669643402, + "timestamp": "2025-09-05 09:15:40.691098", + "step": 3479, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:40.894679", + "step": 3479, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3198263347148895, + "timestamp": "2025-09-05 09:15:40.908795", + "step": 3480, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:15:45.541745", + "step": 3480, + "epoch": 3 + }, + { + "type": "pplx", + "content": 57.259662380369484, + "timestamp": "2025-09-05 09:15:45.543529", + "step": 3480, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3480", + "timestamp": "2025-09-05 09:15:45.993651", + "step": 3480, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:15:46.154598", + "step": 3480, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30058109760284424, + "timestamp": "2025-09-05 09:15:46.156858", + "step": 3481, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:15:46.349969", + "step": 3481, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.325216144323349, + "timestamp": "2025-09-05 09:15:46.351846", + "step": 3482, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:46.554582", + "step": 3482, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2780161499977112, + "timestamp": "2025-09-05 09:15:46.556875", + "step": 3483, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:46.760064", + "step": 3483, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1740078330039978, + "timestamp": "2025-09-05 09:15:46.768946", + "step": 3484, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:15:46.930918", + "step": 3484, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3463882803916931, + "timestamp": "2025-09-05 09:15:46.933011", + "step": 3485, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:47.137903", + "step": 3485, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1923578530550003, + "timestamp": "2025-09-05 09:15:47.140077", + "step": 3486, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:47.334418", + "step": 3486, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27616503834724426, + "timestamp": "2025-09-05 09:15:47.336586", + "step": 3487, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:15:47.541527", + "step": 3487, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.475725382566452, + "timestamp": "2025-09-05 09:15:47.550637", + "step": 3488, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:15:47.714230", + "step": 3488, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.42054125666618347, + "timestamp": "2025-09-05 09:15:47.716194", + "step": 3489, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:47.920059", + "step": 3489, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2236691415309906, + "timestamp": "2025-09-05 09:15:47.922360", + "step": 3490, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:48.117564", + "step": 3490, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21372617781162262, + "timestamp": "2025-09-05 09:15:48.119899", + "step": 3491, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:48.282364", + "step": 3491, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3831466734409332, + "timestamp": "2025-09-05 09:15:48.296563", + "step": 3492, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:48.480614", + "step": 3492, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4005196690559387, + "timestamp": "2025-09-05 09:15:48.482752", + "step": 3493, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:15:48.687366", + "step": 3493, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3020406663417816, + "timestamp": "2025-09-05 09:15:48.689928", + "step": 3494, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:48.882552", + "step": 3494, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.33710694313049316, + "timestamp": "2025-09-05 09:15:48.884841", + "step": 3495, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:49.077262", + "step": 3495, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.41189056634902954, + "timestamp": "2025-09-05 09:15:49.093452", + "step": 3496, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:49.285982", + "step": 3496, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3529796004295349, + "timestamp": "2025-09-05 09:15:49.288044", + "step": 3497, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:49.481074", + "step": 3497, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.41452711820602417, + "timestamp": "2025-09-05 09:15:49.483836", + "step": 3498, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:49.685250", + "step": 3498, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23813626170158386, + "timestamp": "2025-09-05 09:15:49.688159", + "step": 3499, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:15:49.890585", + "step": 3499, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2140645980834961, + "timestamp": "2025-09-05 09:15:49.904427", + "step": 3500, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:15:54.532826", + "step": 3500, + "epoch": 3 + }, + { + "type": "pplx", + "content": 57.432321365212985, + "timestamp": "2025-09-05 09:15:54.535264", + "step": 3500, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:15:54.697969", + "step": 3500, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3301979899406433, + "timestamp": "2025-09-05 09:15:54.700718", + "step": 3501, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:15:54.868212", + "step": 3501, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.25775179266929626, + "timestamp": "2025-09-05 09:15:54.870710", + "step": 3502, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:55.074541", + "step": 3502, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2093990296125412, + "timestamp": "2025-09-05 09:15:55.076740", + "step": 3503, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:55.270867", + "step": 3503, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23625101149082184, + "timestamp": "2025-09-05 09:15:55.286009", + "step": 3504, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:55.473909", + "step": 3504, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3592059016227722, + "timestamp": "2025-09-05 09:15:55.475922", + "step": 3505, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:55.679420", + "step": 3505, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2650335431098938, + "timestamp": "2025-09-05 09:15:55.681455", + "step": 3506, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:55.885416", + "step": 3506, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.25605595111846924, + "timestamp": "2025-09-05 09:15:55.887447", + "step": 3507, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:56.082118", + "step": 3507, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2315322905778885, + "timestamp": "2025-09-05 09:15:56.098407", + "step": 3508, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:15:56.294219", + "step": 3508, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3174898028373718, + "timestamp": "2025-09-05 09:15:56.296213", + "step": 3509, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:56.499496", + "step": 3509, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3008250296115875, + "timestamp": "2025-09-05 09:15:56.501873", + "step": 3510, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:56.704909", + "step": 3510, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.33064156770706177, + "timestamp": "2025-09-05 09:15:56.707016", + "step": 3511, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:15:56.902443", + "step": 3511, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2688453793525696, + "timestamp": "2025-09-05 09:15:56.911436", + "step": 3512, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:57.074020", + "step": 3512, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23117870092391968, + "timestamp": "2025-09-05 09:15:57.075969", + "step": 3513, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:15:57.280000", + "step": 3513, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.17968448996543884, + "timestamp": "2025-09-05 09:15:57.281980", + "step": 3514, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:57.479408", + "step": 3514, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.25697532296180725, + "timestamp": "2025-09-05 09:15:57.481853", + "step": 3515, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:57.685913", + "step": 3515, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29816049337387085, + "timestamp": "2025-09-05 09:15:57.698968", + "step": 3516, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:57.885343", + "step": 3516, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23597969114780426, + "timestamp": "2025-09-05 09:15:57.887857", + "step": 3517, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:15:58.093491", + "step": 3517, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26065388321876526, + "timestamp": "2025-09-05 09:15:58.095763", + "step": 3518, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:15:58.291219", + "step": 3518, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24879339337348938, + "timestamp": "2025-09-05 09:15:58.293254", + "step": 3519, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:15:58.489325", + "step": 3519, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2509574294090271, + "timestamp": "2025-09-05 09:15:58.502474", + "step": 3520, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:16:03.161672", + "step": 3520, + "epoch": 3 + }, + { + "type": "pplx", + "content": 57.248311386178514, + "timestamp": "2025-09-05 09:16:03.164871", + "step": 3520, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3520", + "timestamp": "2025-09-05 09:16:03.773257", + "step": 3520, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:16:03.938424", + "step": 3520, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24730339646339417, + "timestamp": "2025-09-05 09:16:03.940606", + "step": 3521, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:04.135104", + "step": 3521, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3603556752204895, + "timestamp": "2025-09-05 09:16:04.137174", + "step": 3522, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:04.332922", + "step": 3522, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27035149931907654, + "timestamp": "2025-09-05 09:16:04.335173", + "step": 3523, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:04.531971", + "step": 3523, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19072581827640533, + "timestamp": "2025-09-05 09:16:04.546028", + "step": 3524, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:04.732763", + "step": 3524, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4346585273742676, + "timestamp": "2025-09-05 09:16:04.735731", + "step": 3525, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:04.929976", + "step": 3525, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30274319648742676, + "timestamp": "2025-09-05 09:16:04.932300", + "step": 3526, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:05.128812", + "step": 3526, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2595900893211365, + "timestamp": "2025-09-05 09:16:05.130772", + "step": 3527, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:05.324734", + "step": 3527, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20915935933589935, + "timestamp": "2025-09-05 09:16:05.340930", + "step": 3528, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:05.536897", + "step": 3528, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26858463883399963, + "timestamp": "2025-09-05 09:16:05.539280", + "step": 3529, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:05.705787", + "step": 3529, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.35047516226768494, + "timestamp": "2025-09-05 09:16:05.708011", + "step": 3530, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:05.911338", + "step": 3530, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2243708372116089, + "timestamp": "2025-09-05 09:16:05.913729", + "step": 3531, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:06.082348", + "step": 3531, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19937127828598022, + "timestamp": "2025-09-05 09:16:06.098218", + "step": 3532, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:06.294245", + "step": 3532, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.17561663687229156, + "timestamp": "2025-09-05 09:16:06.296290", + "step": 3533, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:06.497951", + "step": 3533, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1659424901008606, + "timestamp": "2025-09-05 09:16:06.499835", + "step": 3534, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:16:06.696327", + "step": 3534, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28922516107559204, + "timestamp": "2025-09-05 09:16:06.699244", + "step": 3535, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:06.896587", + "step": 3535, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.13516835868358612, + "timestamp": "2025-09-05 09:16:06.911478", + "step": 3536, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:07.110059", + "step": 3536, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.217886820435524, + "timestamp": "2025-09-05 09:16:07.112524", + "step": 3537, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:07.308950", + "step": 3537, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2815990149974823, + "timestamp": "2025-09-05 09:16:07.310854", + "step": 3538, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:07.477125", + "step": 3538, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34248197078704834, + "timestamp": "2025-09-05 09:16:07.479196", + "step": 3539, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:07.683880", + "step": 3539, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.35142749547958374, + "timestamp": "2025-09-05 09:16:07.700213", + "step": 3540, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:16:12.334278", + "step": 3540, + "epoch": 3 + }, + { + "type": "pplx", + "content": 57.18878524249908, + "timestamp": "2025-09-05 09:16:12.336402", + "step": 3540, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:16:12.495423", + "step": 3540, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2691498398780823, + "timestamp": "2025-09-05 09:16:12.497432", + "step": 3541, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:12.703100", + "step": 3541, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21435484290122986, + "timestamp": "2025-09-05 09:16:12.705330", + "step": 3542, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:12.901321", + "step": 3542, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.257010281085968, + "timestamp": "2025-09-05 09:16:12.903276", + "step": 3543, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:13.107816", + "step": 3543, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31392380595207214, + "timestamp": "2025-09-05 09:16:13.121641", + "step": 3544, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:13.316878", + "step": 3544, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2716064751148224, + "timestamp": "2025-09-05 09:16:13.318958", + "step": 3545, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:16:13.483554", + "step": 3545, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.33304086327552795, + "timestamp": "2025-09-05 09:16:13.486119", + "step": 3546, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:16:13.688428", + "step": 3546, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3453178405761719, + "timestamp": "2025-09-05 09:16:13.690938", + "step": 3547, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:13.890286", + "step": 3547, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3486884832382202, + "timestamp": "2025-09-05 09:16:13.907139", + "step": 3548, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:16:14.103328", + "step": 3548, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.44889482855796814, + "timestamp": "2025-09-05 09:16:14.106268", + "step": 3549, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:14.302303", + "step": 3549, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2349972277879715, + "timestamp": "2025-09-05 09:16:14.304333", + "step": 3550, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:14.503881", + "step": 3550, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.18555277585983276, + "timestamp": "2025-09-05 09:16:14.505559", + "step": 3551, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:14.701071", + "step": 3551, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34614723920822144, + "timestamp": "2025-09-05 09:16:14.714520", + "step": 3552, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:14.902023", + "step": 3552, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22294148802757263, + "timestamp": "2025-09-05 09:16:14.903759", + "step": 3553, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:15.099938", + "step": 3553, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3558078706264496, + "timestamp": "2025-09-05 09:16:15.102509", + "step": 3554, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:16:15.295938", + "step": 3554, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20311148464679718, + "timestamp": "2025-09-05 09:16:15.297998", + "step": 3555, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:15.492813", + "step": 3555, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2172691971063614, + "timestamp": "2025-09-05 09:16:15.508345", + "step": 3556, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:15.702178", + "step": 3556, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29311197996139526, + "timestamp": "2025-09-05 09:16:15.704350", + "step": 3557, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:16:15.907404", + "step": 3557, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31963905692100525, + "timestamp": "2025-09-05 09:16:15.909545", + "step": 3558, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:16.111802", + "step": 3558, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.47033655643463135, + "timestamp": "2025-09-05 09:16:16.113736", + "step": 3559, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:16:16.307815", + "step": 3559, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34976720809936523, + "timestamp": "2025-09-05 09:16:16.321703", + "step": 3560, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:16:21.188059", + "step": 3560, + "epoch": 3 + }, + { + "type": "pplx", + "content": 57.505925013612185, + "timestamp": "2025-09-05 09:16:21.190112", + "step": 3560, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3560", + "timestamp": "2025-09-05 09:16:21.625686", + "step": 3560, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:21.786287", + "step": 3560, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.33070307970046997, + "timestamp": "2025-09-05 09:16:21.788767", + "step": 3561, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:21.953471", + "step": 3561, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2531128227710724, + "timestamp": "2025-09-05 09:16:21.955557", + "step": 3562, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:22.148695", + "step": 3562, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3364905118942261, + "timestamp": "2025-09-05 09:16:22.150816", + "step": 3563, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:22.321957", + "step": 3563, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2518615424633026, + "timestamp": "2025-09-05 09:16:22.335784", + "step": 3564, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:22.489115", + "step": 3564, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.35287201404571533, + "timestamp": "2025-09-05 09:16:22.491003", + "step": 3565, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:22.649868", + "step": 3565, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31937721371650696, + "timestamp": "2025-09-05 09:16:22.651695", + "step": 3566, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:22.821112", + "step": 3566, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.329274445772171, + "timestamp": "2025-09-05 09:16:22.823418", + "step": 3567, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:16:22.981071", + "step": 3567, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23146797716617584, + "timestamp": "2025-09-05 09:16:22.994974", + "step": 3568, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:16:23.147220", + "step": 3568, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3371192514896393, + "timestamp": "2025-09-05 09:16:23.149264", + "step": 3569, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:23.309883", + "step": 3569, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30641764402389526, + "timestamp": "2025-09-05 09:16:23.312208", + "step": 3570, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:23.470545", + "step": 3570, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3931610882282257, + "timestamp": "2025-09-05 09:16:23.472232", + "step": 3571, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:23.628899", + "step": 3571, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2743593454360962, + "timestamp": "2025-09-05 09:16:23.642205", + "step": 3572, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:23.793306", + "step": 3572, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19696570932865143, + "timestamp": "2025-09-05 09:16:23.795320", + "step": 3573, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:23.964282", + "step": 3573, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31469157338142395, + "timestamp": "2025-09-05 09:16:23.966092", + "step": 3574, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:24.124842", + "step": 3574, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.35265278816223145, + "timestamp": "2025-09-05 09:16:24.126761", + "step": 3575, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:16:24.285175", + "step": 3575, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2801409959793091, + "timestamp": "2025-09-05 09:16:24.299057", + "step": 3576, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:16:24.450733", + "step": 3576, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26633474230766296, + "timestamp": "2025-09-05 09:16:24.453513", + "step": 3577, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:24.610965", + "step": 3577, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3046843707561493, + "timestamp": "2025-09-05 09:16:24.612825", + "step": 3578, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:24.769922", + "step": 3578, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3721226453781128, + "timestamp": "2025-09-05 09:16:24.771766", + "step": 3579, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:16:24.931204", + "step": 3579, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.264265239238739, + "timestamp": "2025-09-05 09:16:24.945012", + "step": 3580, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:16:29.567482", + "step": 3580, + "epoch": 3 + }, + { + "type": "pplx", + "content": 57.221559294521626, + "timestamp": "2025-09-05 09:16:29.569634", + "step": 3580, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:16:29.702774", + "step": 3580, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27857106924057007, + "timestamp": "2025-09-05 09:16:29.705099", + "step": 3581, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:29.841394", + "step": 3581, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.35566446185112, + "timestamp": "2025-09-05 09:16:29.843601", + "step": 3582, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:30.016043", + "step": 3582, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2681717574596405, + "timestamp": "2025-09-05 09:16:30.018309", + "step": 3583, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:30.183556", + "step": 3583, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29908114671707153, + "timestamp": "2025-09-05 09:16:30.199639", + "step": 3584, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:16:30.363380", + "step": 3584, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.32012149691581726, + "timestamp": "2025-09-05 09:16:30.365411", + "step": 3585, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:16:30.534700", + "step": 3585, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1617680937051773, + "timestamp": "2025-09-05 09:16:30.536749", + "step": 3586, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:30.711043", + "step": 3586, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.18636199831962585, + "timestamp": "2025-09-05 09:16:30.713254", + "step": 3587, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:16:30.877559", + "step": 3587, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26028791069984436, + "timestamp": "2025-09-05 09:16:30.891716", + "step": 3588, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:16:31.048008", + "step": 3588, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23478567600250244, + "timestamp": "2025-09-05 09:16:31.050538", + "step": 3589, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:16:31.214481", + "step": 3589, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.25303134322166443, + "timestamp": "2025-09-05 09:16:31.216538", + "step": 3590, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:16:31.353478", + "step": 3590, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3114836812019348, + "timestamp": "2025-09-05 09:16:31.355987", + "step": 3591, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:31.491787", + "step": 3591, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.341848224401474, + "timestamp": "2025-09-05 09:16:31.507779", + "step": 3592, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:31.672999", + "step": 3592, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34669381380081177, + "timestamp": "2025-09-05 09:16:31.675020", + "step": 3593, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:31.839374", + "step": 3593, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24734577536582947, + "timestamp": "2025-09-05 09:16:31.841260", + "step": 3594, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:31.979773", + "step": 3594, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2852213978767395, + "timestamp": "2025-09-05 09:16:31.982039", + "step": 3595, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:32.154180", + "step": 3595, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20489144325256348, + "timestamp": "2025-09-05 09:16:32.168329", + "step": 3596, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:16:32.328468", + "step": 3596, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2507217824459076, + "timestamp": "2025-09-05 09:16:32.330751", + "step": 3597, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:16:32.494693", + "step": 3597, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29786399006843567, + "timestamp": "2025-09-05 09:16:32.496588", + "step": 3598, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:32.669476", + "step": 3598, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31531691551208496, + "timestamp": "2025-09-05 09:16:32.671583", + "step": 3599, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:32.845699", + "step": 3599, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28864872455596924, + "timestamp": "2025-09-05 09:16:32.862166", + "step": 3600, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:16:37.515636", + "step": 3600, + "epoch": 3 + }, + { + "type": "pplx", + "content": 56.849603424621414, + "timestamp": "2025-09-05 09:16:37.517453", + "step": 3600, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3600", + "timestamp": "2025-09-05 09:16:37.973586", + "step": 3600, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:16:38.113656", + "step": 3600, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3407289683818817, + "timestamp": "2025-09-05 09:16:38.115499", + "step": 3601, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:38.313738", + "step": 3601, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2807539999485016, + "timestamp": "2025-09-05 09:16:38.315728", + "step": 3602, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:38.481679", + "step": 3602, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2940472662448883, + "timestamp": "2025-09-05 09:16:38.483627", + "step": 3603, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:38.650368", + "step": 3603, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3419131934642792, + "timestamp": "2025-09-05 09:16:38.663614", + "step": 3604, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:38.821500", + "step": 3604, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2667962610721588, + "timestamp": "2025-09-05 09:16:38.824940", + "step": 3605, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:38.989583", + "step": 3605, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3813531696796417, + "timestamp": "2025-09-05 09:16:38.991762", + "step": 3606, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:39.156661", + "step": 3606, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21620894968509674, + "timestamp": "2025-09-05 09:16:39.158848", + "step": 3607, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:39.322673", + "step": 3607, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28690725564956665, + "timestamp": "2025-09-05 09:16:39.337856", + "step": 3608, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:39.504144", + "step": 3608, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2018442302942276, + "timestamp": "2025-09-05 09:16:39.506367", + "step": 3609, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:39.694573", + "step": 3609, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24246446788311005, + "timestamp": "2025-09-05 09:16:39.696675", + "step": 3610, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:16:39.903678", + "step": 3610, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2503988444805145, + "timestamp": "2025-09-05 09:16:39.905565", + "step": 3611, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:40.101897", + "step": 3611, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2636902928352356, + "timestamp": "2025-09-05 09:16:40.114992", + "step": 3612, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:16:40.309381", + "step": 3612, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2856784164905548, + "timestamp": "2025-09-05 09:16:40.311717", + "step": 3613, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:40.507200", + "step": 3613, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21770760416984558, + "timestamp": "2025-09-05 09:16:40.509303", + "step": 3614, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:16:40.712550", + "step": 3614, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3094342350959778, + "timestamp": "2025-09-05 09:16:40.714794", + "step": 3615, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:40.921171", + "step": 3615, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22260645031929016, + "timestamp": "2025-09-05 09:16:40.937490", + "step": 3616, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:41.132556", + "step": 3616, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2806592881679535, + "timestamp": "2025-09-05 09:16:41.134659", + "step": 3617, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:16:41.330202", + "step": 3617, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2558315098285675, + "timestamp": "2025-09-05 09:16:41.332341", + "step": 3618, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:41.535714", + "step": 3618, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29388701915740967, + "timestamp": "2025-09-05 09:16:41.538011", + "step": 3619, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:41.735923", + "step": 3619, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.233632892370224, + "timestamp": "2025-09-05 09:16:41.750243", + "step": 3620, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:16:46.373597", + "step": 3620, + "epoch": 3 + }, + { + "type": "pplx", + "content": 56.65468023616325, + "timestamp": "2025-09-05 09:16:46.375753", + "step": 3620, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:46.536738", + "step": 3620, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3380981683731079, + "timestamp": "2025-09-05 09:16:46.539311", + "step": 3621, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:46.703329", + "step": 3621, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.44205331802368164, + "timestamp": "2025-09-05 09:16:46.705667", + "step": 3622, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:46.911147", + "step": 3622, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22291947901248932, + "timestamp": "2025-09-05 09:16:46.913490", + "step": 3623, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:47.117238", + "step": 3623, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2371853142976761, + "timestamp": "2025-09-05 09:16:47.134683", + "step": 3624, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:47.325947", + "step": 3624, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3914877772331238, + "timestamp": "2025-09-05 09:16:47.327818", + "step": 3625, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:47.522207", + "step": 3625, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.18587626516819, + "timestamp": "2025-09-05 09:16:47.524016", + "step": 3626, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:47.727008", + "step": 3626, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.33849427103996277, + "timestamp": "2025-09-05 09:16:47.728996", + "step": 3627, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:16:47.932545", + "step": 3627, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30909058451652527, + "timestamp": "2025-09-05 09:16:47.948909", + "step": 3628, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:48.145519", + "step": 3628, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3659791350364685, + "timestamp": "2025-09-05 09:16:48.147519", + "step": 3629, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:48.352926", + "step": 3629, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.33907654881477356, + "timestamp": "2025-09-05 09:16:48.354868", + "step": 3630, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:48.549234", + "step": 3630, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20109602808952332, + "timestamp": "2025-09-05 09:16:48.551245", + "step": 3631, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:48.756040", + "step": 3631, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20491234958171844, + "timestamp": "2025-09-05 09:16:48.769919", + "step": 3632, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:16:48.959793", + "step": 3632, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22076480090618134, + "timestamp": "2025-09-05 09:16:48.961632", + "step": 3633, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:49.156216", + "step": 3633, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3952508568763733, + "timestamp": "2025-09-05 09:16:49.158140", + "step": 3634, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:16:49.353843", + "step": 3634, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.365893691778183, + "timestamp": "2025-09-05 09:16:49.356162", + "step": 3635, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:49.552509", + "step": 3635, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4604712426662445, + "timestamp": "2025-09-05 09:16:49.566332", + "step": 3636, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:49.753640", + "step": 3636, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2598891258239746, + "timestamp": "2025-09-05 09:16:49.755677", + "step": 3637, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:16:49.952092", + "step": 3637, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19562163949012756, + "timestamp": "2025-09-05 09:16:49.954362", + "step": 3638, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:16:50.150810", + "step": 3638, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3006066679954529, + "timestamp": "2025-09-05 09:16:50.152516", + "step": 3639, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:50.357244", + "step": 3639, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26009029150009155, + "timestamp": "2025-09-05 09:16:50.370461", + "step": 3640, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:16:55.002230", + "step": 3640, + "epoch": 3 + }, + { + "type": "pplx", + "content": 57.14356651280716, + "timestamp": "2025-09-05 09:16:55.003937", + "step": 3640, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3640", + "timestamp": "2025-09-05 09:16:55.470108", + "step": 3640, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:55.637860", + "step": 3640, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.33125096559524536, + "timestamp": "2025-09-05 09:16:55.640180", + "step": 3641, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:55.834264", + "step": 3641, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29791468381881714, + "timestamp": "2025-09-05 09:16:55.836217", + "step": 3642, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:16:56.037864", + "step": 3642, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22799059748649597, + "timestamp": "2025-09-05 09:16:56.040517", + "step": 3643, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:56.237007", + "step": 3643, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19178558886051178, + "timestamp": "2025-09-05 09:16:56.251290", + "step": 3644, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:16:56.437268", + "step": 3644, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2333633154630661, + "timestamp": "2025-09-05 09:16:56.439316", + "step": 3645, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:16:56.641884", + "step": 3645, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3371458053588867, + "timestamp": "2025-09-05 09:16:56.643663", + "step": 3646, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:16:56.838438", + "step": 3646, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2189522385597229, + "timestamp": "2025-09-05 09:16:56.840576", + "step": 3647, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:57.036412", + "step": 3647, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20619438588619232, + "timestamp": "2025-09-05 09:16:57.052386", + "step": 3648, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:57.247348", + "step": 3648, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3777850568294525, + "timestamp": "2025-09-05 09:16:57.249655", + "step": 3649, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:57.453023", + "step": 3649, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19339697062969208, + "timestamp": "2025-09-05 09:16:57.455107", + "step": 3650, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:57.650129", + "step": 3650, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24509845674037933, + "timestamp": "2025-09-05 09:16:57.653034", + "step": 3651, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:16:57.851212", + "step": 3651, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20124991238117218, + "timestamp": "2025-09-05 09:16:57.865147", + "step": 3652, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:58.053115", + "step": 3652, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2905905246734619, + "timestamp": "2025-09-05 09:16:58.055367", + "step": 3653, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:16:58.259967", + "step": 3653, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3718425929546356, + "timestamp": "2025-09-05 09:16:58.262379", + "step": 3654, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:58.469732", + "step": 3654, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.278747022151947, + "timestamp": "2025-09-05 09:16:58.472249", + "step": 3655, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:58.637485", + "step": 3655, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.15937921404838562, + "timestamp": "2025-09-05 09:16:58.654440", + "step": 3656, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:58.853334", + "step": 3656, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.25874295830726624, + "timestamp": "2025-09-05 09:16:58.855297", + "step": 3657, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:16:59.060556", + "step": 3657, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29497626423835754, + "timestamp": "2025-09-05 09:16:59.062998", + "step": 3658, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:16:59.260298", + "step": 3658, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31460994482040405, + "timestamp": "2025-09-05 09:16:59.262889", + "step": 3659, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:16:59.468470", + "step": 3659, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23476682603359222, + "timestamp": "2025-09-05 09:16:59.482345", + "step": 3660, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:17:04.179168", + "step": 3660, + "epoch": 3 + }, + { + "type": "pplx", + "content": 58.84672962747659, + "timestamp": "2025-09-05 09:17:04.183501", + "step": 3660, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:17:04.345280", + "step": 3660, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2967011034488678, + "timestamp": "2025-09-05 09:17:04.347457", + "step": 3661, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:04.486037", + "step": 3661, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20157547295093536, + "timestamp": "2025-09-05 09:17:04.491324", + "step": 3662, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:04.669365", + "step": 3662, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29696306586265564, + "timestamp": "2025-09-05 09:17:04.671800", + "step": 3663, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:17:04.836191", + "step": 3663, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31658288836479187, + "timestamp": "2025-09-05 09:17:04.845138", + "step": 3664, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:04.981361", + "step": 3664, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31298524141311646, + "timestamp": "2025-09-05 09:17:04.986225", + "step": 3665, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:05.204135", + "step": 3665, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24473224580287933, + "timestamp": "2025-09-05 09:17:05.207421", + "step": 3666, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:17:05.386416", + "step": 3666, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30507656931877136, + "timestamp": "2025-09-05 09:17:05.388406", + "step": 3667, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:17:05.562709", + "step": 3667, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3829200565814972, + "timestamp": "2025-09-05 09:17:05.572358", + "step": 3668, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:17:05.707364", + "step": 3668, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21303744614124298, + "timestamp": "2025-09-05 09:17:05.709248", + "step": 3669, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:05.873112", + "step": 3669, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19828033447265625, + "timestamp": "2025-09-05 09:17:05.875438", + "step": 3670, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:17:06.044585", + "step": 3670, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19999150931835175, + "timestamp": "2025-09-05 09:17:06.046676", + "step": 3671, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:17:06.210600", + "step": 3671, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20807354152202606, + "timestamp": "2025-09-05 09:17:06.224093", + "step": 3672, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:17:06.381194", + "step": 3672, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.337863951921463, + "timestamp": "2025-09-05 09:17:06.383013", + "step": 3673, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:06.548655", + "step": 3673, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.43848153948783875, + "timestamp": "2025-09-05 09:17:06.550812", + "step": 3674, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:06.726192", + "step": 3674, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3983319401741028, + "timestamp": "2025-09-05 09:17:06.728124", + "step": 3675, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:17:06.902251", + "step": 3675, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.18885241448879242, + "timestamp": "2025-09-05 09:17:06.917409", + "step": 3676, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:07.095542", + "step": 3676, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22875548899173737, + "timestamp": "2025-09-05 09:17:07.097896", + "step": 3677, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:17:07.261387", + "step": 3677, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19291433691978455, + "timestamp": "2025-09-05 09:17:07.263485", + "step": 3678, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:17:07.436357", + "step": 3678, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2069120854139328, + "timestamp": "2025-09-05 09:17:07.438453", + "step": 3679, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:07.613336", + "step": 3679, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27292466163635254, + "timestamp": "2025-09-05 09:17:07.627569", + "step": 3680, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:17:12.442469", + "step": 3680, + "epoch": 3 + }, + { + "type": "pplx", + "content": 58.00292293677735, + "timestamp": "2025-09-05 09:17:12.444593", + "step": 3680, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3680", + "timestamp": "2025-09-05 09:17:12.923472", + "step": 3680, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:13.096349", + "step": 3680, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28774937987327576, + "timestamp": "2025-09-05 09:17:13.099635", + "step": 3681, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:13.270133", + "step": 3681, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29541826248168945, + "timestamp": "2025-09-05 09:17:13.272636", + "step": 3682, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:13.446220", + "step": 3682, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20348146557807922, + "timestamp": "2025-09-05 09:17:13.448403", + "step": 3683, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:13.621708", + "step": 3683, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27651840448379517, + "timestamp": "2025-09-05 09:17:13.637686", + "step": 3684, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:13.803773", + "step": 3684, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.16801585257053375, + "timestamp": "2025-09-05 09:17:13.811331", + "step": 3685, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:13.977709", + "step": 3685, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27807697653770447, + "timestamp": "2025-09-05 09:17:13.990085", + "step": 3686, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:14.174891", + "step": 3686, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28929510712623596, + "timestamp": "2025-09-05 09:17:14.176862", + "step": 3687, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:14.342954", + "step": 3687, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.33495303988456726, + "timestamp": "2025-09-05 09:17:14.359164", + "step": 3688, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:14.525050", + "step": 3688, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.40975499153137207, + "timestamp": "2025-09-05 09:17:14.530323", + "step": 3689, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:17:14.707766", + "step": 3689, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34368452429771423, + "timestamp": "2025-09-05 09:17:14.711194", + "step": 3690, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:17:14.885111", + "step": 3690, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21246421337127686, + "timestamp": "2025-09-05 09:17:14.888437", + "step": 3691, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:15.063534", + "step": 3691, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.17844977974891663, + "timestamp": "2025-09-05 09:17:15.079004", + "step": 3692, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:15.247480", + "step": 3692, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23904670774936676, + "timestamp": "2025-09-05 09:17:15.250098", + "step": 3693, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:17:15.419302", + "step": 3693, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2476682811975479, + "timestamp": "2025-09-05 09:17:15.421347", + "step": 3694, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:15.585971", + "step": 3694, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.14145521819591522, + "timestamp": "2025-09-05 09:17:15.587978", + "step": 3695, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:15.755238", + "step": 3695, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2892580032348633, + "timestamp": "2025-09-05 09:17:15.769585", + "step": 3696, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:17:15.933810", + "step": 3696, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20448994636535645, + "timestamp": "2025-09-05 09:17:15.935738", + "step": 3697, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:17:16.110181", + "step": 3697, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30457159876823425, + "timestamp": "2025-09-05 09:17:16.112415", + "step": 3698, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:17:16.278273", + "step": 3698, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19800890982151031, + "timestamp": "2025-09-05 09:17:16.281130", + "step": 3699, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:16.447546", + "step": 3699, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.10996751487255096, + "timestamp": "2025-09-05 09:17:16.464001", + "step": 3700, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:17:21.189675", + "step": 3700, + "epoch": 3 + }, + { + "type": "pplx", + "content": 56.65837507121739, + "timestamp": "2025-09-05 09:17:21.192210", + "step": 3700, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:21.325124", + "step": 3700, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.35211700201034546, + "timestamp": "2025-09-05 09:17:21.328555", + "step": 3701, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:21.478549", + "step": 3701, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2543327510356903, + "timestamp": "2025-09-05 09:17:21.483201", + "step": 3702, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:21.661615", + "step": 3702, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.10399866104125977, + "timestamp": "2025-09-05 09:17:21.663790", + "step": 3703, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:21.839507", + "step": 3703, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.17878511548042297, + "timestamp": "2025-09-05 09:17:21.853699", + "step": 3704, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:22.023690", + "step": 3704, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.25016236305236816, + "timestamp": "2025-09-05 09:17:22.026198", + "step": 3705, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:22.190767", + "step": 3705, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.356041818857193, + "timestamp": "2025-09-05 09:17:22.192750", + "step": 3706, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:22.353308", + "step": 3706, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31788352131843567, + "timestamp": "2025-09-05 09:17:22.355417", + "step": 3707, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 272 + ], + "flops": 5440033091648.0 + }, + "timestamp": "2025-09-05 09:17:22.513433", + "step": 3707, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4455195367336273, + "timestamp": "2025-09-05 09:17:22.529158", + "step": 3708, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:22.694433", + "step": 3708, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1478017121553421, + "timestamp": "2025-09-05 09:17:22.696732", + "step": 3709, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:22.865305", + "step": 3709, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30112531781196594, + "timestamp": "2025-09-05 09:17:22.867849", + "step": 3710, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:17:23.040859", + "step": 3710, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24474984407424927, + "timestamp": "2025-09-05 09:17:23.043598", + "step": 3711, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:17:23.202386", + "step": 3711, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2871285080909729, + "timestamp": "2025-09-05 09:17:23.217262", + "step": 3712, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:23.370485", + "step": 3712, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2523558437824249, + "timestamp": "2025-09-05 09:17:23.373047", + "step": 3713, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:23.533342", + "step": 3713, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1851450353860855, + "timestamp": "2025-09-05 09:17:23.535368", + "step": 3714, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:23.712360", + "step": 3714, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.41676124930381775, + "timestamp": "2025-09-05 09:17:23.714662", + "step": 3715, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:23.886528", + "step": 3715, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28601813316345215, + "timestamp": "2025-09-05 09:17:23.903673", + "step": 3716, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:17:24.067942", + "step": 3716, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2414342612028122, + "timestamp": "2025-09-05 09:17:24.070264", + "step": 3717, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:24.236463", + "step": 3717, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28098100423812866, + "timestamp": "2025-09-05 09:17:24.239880", + "step": 3718, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:24.398819", + "step": 3718, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2182604819536209, + "timestamp": "2025-09-05 09:17:24.402058", + "step": 3719, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:24.569550", + "step": 3719, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21126191318035126, + "timestamp": "2025-09-05 09:17:24.587286", + "step": 3720, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:17:29.301300", + "step": 3720, + "epoch": 3 + }, + { + "type": "pplx", + "content": 57.851098754460935, + "timestamp": "2025-09-05 09:17:29.303386", + "step": 3720, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3720", + "timestamp": "2025-09-05 09:17:29.780448", + "step": 3720, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:17:29.918245", + "step": 3720, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.17629235982894897, + "timestamp": "2025-09-05 09:17:29.920249", + "step": 3721, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:30.077145", + "step": 3721, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19920383393764496, + "timestamp": "2025-09-05 09:17:30.080108", + "step": 3722, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:17:30.257935", + "step": 3722, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1470199078321457, + "timestamp": "2025-09-05 09:17:30.260188", + "step": 3723, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:17:30.421805", + "step": 3723, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3522961735725403, + "timestamp": "2025-09-05 09:17:30.434868", + "step": 3724, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:30.588266", + "step": 3724, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22870679199695587, + "timestamp": "2025-09-05 09:17:30.590259", + "step": 3725, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:30.751751", + "step": 3725, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.35698115825653076, + "timestamp": "2025-09-05 09:17:30.754837", + "step": 3726, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:17:30.914440", + "step": 3726, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2308841049671173, + "timestamp": "2025-09-05 09:17:30.916372", + "step": 3727, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:31.089362", + "step": 3727, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30304333567619324, + "timestamp": "2025-09-05 09:17:31.103816", + "step": 3728, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:31.258454", + "step": 3728, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4719480872154236, + "timestamp": "2025-09-05 09:17:31.261128", + "step": 3729, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:31.419266", + "step": 3729, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23239001631736755, + "timestamp": "2025-09-05 09:17:31.421208", + "step": 3730, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:17:31.594334", + "step": 3730, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.25132450461387634, + "timestamp": "2025-09-05 09:17:31.596356", + "step": 3731, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:17:31.769554", + "step": 3731, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2377987802028656, + "timestamp": "2025-09-05 09:17:31.782682", + "step": 3732, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:31.935284", + "step": 3732, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21928219497203827, + "timestamp": "2025-09-05 09:17:31.937514", + "step": 3733, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:32.094333", + "step": 3733, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.32776015996932983, + "timestamp": "2025-09-05 09:17:32.096980", + "step": 3734, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:17:32.257921", + "step": 3734, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3067801594734192, + "timestamp": "2025-09-05 09:17:32.260923", + "step": 3735, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:32.433004", + "step": 3735, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2571111023426056, + "timestamp": "2025-09-05 09:17:32.450589", + "step": 3736, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:32.617665", + "step": 3736, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31286391615867615, + "timestamp": "2025-09-05 09:17:32.621410", + "step": 3737, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:32.804127", + "step": 3737, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1266995221376419, + "timestamp": "2025-09-05 09:17:32.806894", + "step": 3738, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:32.989557", + "step": 3738, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.18480810523033142, + "timestamp": "2025-09-05 09:17:32.993892", + "step": 3739, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:33.157154", + "step": 3739, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21677125990390778, + "timestamp": "2025-09-05 09:17:33.174230", + "step": 3740, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:17:37.915992", + "step": 3740, + "epoch": 3 + }, + { + "type": "pplx", + "content": 58.138925363076176, + "timestamp": "2025-09-05 09:17:37.919958", + "step": 3740, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:17:38.054986", + "step": 3740, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.17171865701675415, + "timestamp": "2025-09-05 09:17:38.058251", + "step": 3741, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:38.194742", + "step": 3741, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.11303048580884933, + "timestamp": "2025-09-05 09:17:38.196873", + "step": 3742, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:17:38.367087", + "step": 3742, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29843342304229736, + "timestamp": "2025-09-05 09:17:38.369146", + "step": 3743, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:17:38.528710", + "step": 3743, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28549709916114807, + "timestamp": "2025-09-05 09:17:38.545287", + "step": 3744, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:38.704412", + "step": 3744, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3105151653289795, + "timestamp": "2025-09-05 09:17:38.707561", + "step": 3745, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:38.870199", + "step": 3745, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2878493368625641, + "timestamp": "2025-09-05 09:17:38.874012", + "step": 3746, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:39.047543", + "step": 3746, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31547924876213074, + "timestamp": "2025-09-05 09:17:39.049871", + "step": 3747, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:17:39.220202", + "step": 3747, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4833051860332489, + "timestamp": "2025-09-05 09:17:39.234546", + "step": 3748, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:39.394896", + "step": 3748, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21690459549427032, + "timestamp": "2025-09-05 09:17:39.396991", + "step": 3749, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:39.567268", + "step": 3749, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24650557339191437, + "timestamp": "2025-09-05 09:17:39.572449", + "step": 3750, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:39.741704", + "step": 3750, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3864779770374298, + "timestamp": "2025-09-05 09:17:39.744322", + "step": 3751, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:39.929467", + "step": 3751, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.271431028842926, + "timestamp": "2025-09-05 09:17:39.946728", + "step": 3752, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:40.129181", + "step": 3752, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2465740144252777, + "timestamp": "2025-09-05 09:17:40.131170", + "step": 3753, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:40.290242", + "step": 3753, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.18188771605491638, + "timestamp": "2025-09-05 09:17:40.293833", + "step": 3754, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:40.453968", + "step": 3754, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3101077079772949, + "timestamp": "2025-09-05 09:17:40.465944", + "step": 3755, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:40.631092", + "step": 3755, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3483135402202606, + "timestamp": "2025-09-05 09:17:40.648326", + "step": 3756, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:17:40.816767", + "step": 3756, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3474515676498413, + "timestamp": "2025-09-05 09:17:40.820076", + "step": 3757, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:40.995965", + "step": 3757, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3206429183483124, + "timestamp": "2025-09-05 09:17:40.998901", + "step": 3758, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:41.157141", + "step": 3758, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3244212567806244, + "timestamp": "2025-09-05 09:17:41.159484", + "step": 3759, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:17:41.332000", + "step": 3759, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1970166712999344, + "timestamp": "2025-09-05 09:17:41.348675", + "step": 3760, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:17:46.106629", + "step": 3760, + "epoch": 3 + }, + { + "type": "pplx", + "content": 56.030742259845105, + "timestamp": "2025-09-05 09:17:46.109476", + "step": 3760, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3760", + "timestamp": "2025-09-05 09:17:46.628384", + "step": 3760, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:46.817598", + "step": 3760, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24627402424812317, + "timestamp": "2025-09-05 09:17:46.819504", + "step": 3761, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:17:47.015458", + "step": 3761, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4071979224681854, + "timestamp": "2025-09-05 09:17:47.017803", + "step": 3762, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:47.225117", + "step": 3762, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2320803850889206, + "timestamp": "2025-09-05 09:17:47.228767", + "step": 3763, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:47.431687", + "step": 3763, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2511028051376343, + "timestamp": "2025-09-05 09:17:47.447973", + "step": 3764, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:47.644725", + "step": 3764, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1637968271970749, + "timestamp": "2025-09-05 09:17:47.646815", + "step": 3765, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:17:47.855989", + "step": 3765, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4574233293533325, + "timestamp": "2025-09-05 09:17:47.858374", + "step": 3766, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:17:48.053732", + "step": 3766, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2990650236606598, + "timestamp": "2025-09-05 09:17:48.055922", + "step": 3767, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:48.251155", + "step": 3767, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.34469056129455566, + "timestamp": "2025-09-05 09:17:48.264540", + "step": 3768, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:48.451902", + "step": 3768, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1231929287314415, + "timestamp": "2025-09-05 09:17:48.454219", + "step": 3769, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:48.650029", + "step": 3769, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22926165163516998, + "timestamp": "2025-09-05 09:17:48.652239", + "step": 3770, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:48.855961", + "step": 3770, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1999165564775467, + "timestamp": "2025-09-05 09:17:48.858063", + "step": 3771, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:17:49.053725", + "step": 3771, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29211559891700745, + "timestamp": "2025-09-05 09:17:49.068234", + "step": 3772, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:49.260693", + "step": 3772, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2825232446193695, + "timestamp": "2025-09-05 09:17:49.265059", + "step": 3773, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:49.468313", + "step": 3773, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2714124917984009, + "timestamp": "2025-09-05 09:17:49.473227", + "step": 3774, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:49.685717", + "step": 3774, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3337889611721039, + "timestamp": "2025-09-05 09:17:49.692216", + "step": 3775, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:49.897333", + "step": 3775, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28709539771080017, + "timestamp": "2025-09-05 09:17:49.911174", + "step": 3776, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:50.100013", + "step": 3776, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3871142268180847, + "timestamp": "2025-09-05 09:17:50.101949", + "step": 3777, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:17:50.294723", + "step": 3777, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.41545554995536804, + "timestamp": "2025-09-05 09:17:50.296735", + "step": 3778, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:50.491618", + "step": 3778, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.33116820454597473, + "timestamp": "2025-09-05 09:17:50.494035", + "step": 3779, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:50.689535", + "step": 3779, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3106461763381958, + "timestamp": "2025-09-05 09:17:50.705678", + "step": 3780, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:17:55.438519", + "step": 3780, + "epoch": 3 + }, + { + "type": "pplx", + "content": 54.86477109539171, + "timestamp": "2025-09-05 09:17:55.440640", + "step": 3780, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:17:55.602465", + "step": 3780, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2829877734184265, + "timestamp": "2025-09-05 09:17:55.604819", + "step": 3781, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:55.772017", + "step": 3781, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23081792891025543, + "timestamp": "2025-09-05 09:17:55.774253", + "step": 3782, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:17:55.978453", + "step": 3782, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2172662913799286, + "timestamp": "2025-09-05 09:17:55.980547", + "step": 3783, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:17:56.174995", + "step": 3783, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23292969167232513, + "timestamp": "2025-09-05 09:17:56.191374", + "step": 3784, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:56.399032", + "step": 3784, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28087952733039856, + "timestamp": "2025-09-05 09:17:56.404084", + "step": 3785, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:17:56.613247", + "step": 3785, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3004533052444458, + "timestamp": "2025-09-05 09:17:56.615302", + "step": 3786, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:17:56.811021", + "step": 3786, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22735357284545898, + "timestamp": "2025-09-05 09:17:56.813173", + "step": 3787, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:57.008692", + "step": 3787, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.24109166860580444, + "timestamp": "2025-09-05 09:17:57.022332", + "step": 3788, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:17:57.210727", + "step": 3788, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.25741469860076904, + "timestamp": "2025-09-05 09:17:57.212675", + "step": 3789, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:57.407077", + "step": 3789, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.18140548467636108, + "timestamp": "2025-09-05 09:17:57.409333", + "step": 3790, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:57.612999", + "step": 3790, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2594696879386902, + "timestamp": "2025-09-05 09:17:57.615012", + "step": 3791, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:57.810111", + "step": 3791, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2717820703983307, + "timestamp": "2025-09-05 09:17:57.827484", + "step": 3792, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:17:58.038952", + "step": 3792, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2998480200767517, + "timestamp": "2025-09-05 09:17:58.041150", + "step": 3793, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:58.237041", + "step": 3793, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.5077316164970398, + "timestamp": "2025-09-05 09:17:58.241772", + "step": 3794, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:17:58.436343", + "step": 3794, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22378866374492645, + "timestamp": "2025-09-05 09:17:58.438145", + "step": 3795, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:17:58.632479", + "step": 3795, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2865094840526581, + "timestamp": "2025-09-05 09:17:58.646480", + "step": 3796, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:58.834363", + "step": 3796, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2359636276960373, + "timestamp": "2025-09-05 09:17:58.836348", + "step": 3797, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:59.039984", + "step": 3797, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2948862612247467, + "timestamp": "2025-09-05 09:17:59.041770", + "step": 3798, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:17:59.237989", + "step": 3798, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2476307600736618, + "timestamp": "2025-09-05 09:17:59.243974", + "step": 3799, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:17:59.445175", + "step": 3799, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2329883575439453, + "timestamp": "2025-09-05 09:17:59.460504", + "step": 3800, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:18:04.091371", + "step": 3800, + "epoch": 3 + }, + { + "type": "pplx", + "content": 55.55614732154259, + "timestamp": "2025-09-05 09:18:04.093508", + "step": 3800, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3800", + "timestamp": "2025-09-05 09:18:04.825315", + "step": 3800, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:04.992856", + "step": 3800, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30155929923057556, + "timestamp": "2025-09-05 09:18:04.994966", + "step": 3801, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:05.158446", + "step": 3801, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.41426223516464233, + "timestamp": "2025-09-05 09:18:05.164363", + "step": 3802, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:05.370875", + "step": 3802, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.36889323592185974, + "timestamp": "2025-09-05 09:18:05.373209", + "step": 3803, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:05.569555", + "step": 3803, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2549019753932953, + "timestamp": "2025-09-05 09:18:05.585448", + "step": 3804, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:05.780737", + "step": 3804, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.46478819847106934, + "timestamp": "2025-09-05 09:18:05.783228", + "step": 3805, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:05.977843", + "step": 3805, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2971610724925995, + "timestamp": "2025-09-05 09:18:05.979793", + "step": 3806, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:06.175088", + "step": 3806, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.18687593936920166, + "timestamp": "2025-09-05 09:18:06.177352", + "step": 3807, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:06.380428", + "step": 3807, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3274900019168854, + "timestamp": "2025-09-05 09:18:06.394606", + "step": 3808, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:06.582600", + "step": 3808, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21774989366531372, + "timestamp": "2025-09-05 09:18:06.584933", + "step": 3809, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:18:06.780351", + "step": 3809, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.25480151176452637, + "timestamp": "2025-09-05 09:18:06.783183", + "step": 3810, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:06.988610", + "step": 3810, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28949615359306335, + "timestamp": "2025-09-05 09:18:06.991180", + "step": 3811, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:18:07.197046", + "step": 3811, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3601587116718292, + "timestamp": "2025-09-05 09:18:07.213966", + "step": 3812, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:18:07.409722", + "step": 3812, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2533739507198334, + "timestamp": "2025-09-05 09:18:07.412817", + "step": 3813, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:18:07.617938", + "step": 3813, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3216201364994049, + "timestamp": "2025-09-05 09:18:07.620227", + "step": 3814, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:18:07.823930", + "step": 3814, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.26064953207969666, + "timestamp": "2025-09-05 09:18:07.826145", + "step": 3815, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:08.029069", + "step": 3815, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4234212338924408, + "timestamp": "2025-09-05 09:18:08.045468", + "step": 3816, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:18:08.239569", + "step": 3816, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22362388670444489, + "timestamp": "2025-09-05 09:18:08.242035", + "step": 3817, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:18:08.444873", + "step": 3817, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3728334307670593, + "timestamp": "2025-09-05 09:18:08.447328", + "step": 3818, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:08.649347", + "step": 3818, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.46954891085624695, + "timestamp": "2025-09-05 09:18:08.651921", + "step": 3819, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:18:08.843539", + "step": 3819, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2965394854545593, + "timestamp": "2025-09-05 09:18:08.860185", + "step": 3820, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:18:13.496780", + "step": 3820, + "epoch": 3 + }, + { + "type": "pplx", + "content": 55.5854559436734, + "timestamp": "2025-09-05 09:18:13.499381", + "step": 3820, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:18:13.660779", + "step": 3820, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22472511231899261, + "timestamp": "2025-09-05 09:18:13.664259", + "step": 3821, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:13.866930", + "step": 3821, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.25364920496940613, + "timestamp": "2025-09-05 09:18:13.868953", + "step": 3822, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:18:14.062950", + "step": 3822, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21925756335258484, + "timestamp": "2025-09-05 09:18:14.065090", + "step": 3823, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:14.259796", + "step": 3823, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2519068419933319, + "timestamp": "2025-09-05 09:18:14.270679", + "step": 3824, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:14.439212", + "step": 3824, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3214259743690491, + "timestamp": "2025-09-05 09:18:14.441266", + "step": 3825, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:18:14.644776", + "step": 3825, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30687010288238525, + "timestamp": "2025-09-05 09:18:14.648046", + "step": 3826, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:18:14.844746", + "step": 3826, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2706732451915741, + "timestamp": "2025-09-05 09:18:14.846875", + "step": 3827, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-05 09:18:15.039578", + "step": 3827, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21124766767024994, + "timestamp": "2025-09-05 09:18:15.053944", + "step": 3828, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:15.240514", + "step": 3828, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2378242313861847, + "timestamp": "2025-09-05 09:18:15.242970", + "step": 3829, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:15.436793", + "step": 3829, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.37138789892196655, + "timestamp": "2025-09-05 09:18:15.439225", + "step": 3830, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:15.636069", + "step": 3830, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.48219579458236694, + "timestamp": "2025-09-05 09:18:15.639027", + "step": 3831, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:18:15.844375", + "step": 3831, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31807440519332886, + "timestamp": "2025-09-05 09:18:15.853360", + "step": 3832, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:16.014950", + "step": 3832, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19948077201843262, + "timestamp": "2025-09-05 09:18:16.016914", + "step": 3833, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:18:16.222462", + "step": 3833, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3451708257198334, + "timestamp": "2025-09-05 09:18:16.227796", + "step": 3834, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:18:16.432408", + "step": 3834, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1525815725326538, + "timestamp": "2025-09-05 09:18:16.434547", + "step": 3835, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:18:16.629223", + "step": 3835, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.25581640005111694, + "timestamp": "2025-09-05 09:18:16.644719", + "step": 3836, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:18:16.841468", + "step": 3836, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1911073625087738, + "timestamp": "2025-09-05 09:18:16.843859", + "step": 3837, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:18:17.040110", + "step": 3837, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1686500757932663, + "timestamp": "2025-09-05 09:18:17.042298", + "step": 3838, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:17.248084", + "step": 3838, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.15377473831176758, + "timestamp": "2025-09-05 09:18:17.250236", + "step": 3839, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:17.446870", + "step": 3839, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.33481061458587646, + "timestamp": "2025-09-05 09:18:17.461064", + "step": 3840, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:18:22.080629", + "step": 3840, + "epoch": 3 + }, + { + "type": "pplx", + "content": 55.37722131907398, + "timestamp": "2025-09-05 09:18:22.082611", + "step": 3840, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3840", + "timestamp": "2025-09-05 09:18:22.537339", + "step": 3840, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:18:22.697793", + "step": 3840, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2810446321964264, + "timestamp": "2025-09-05 09:18:22.700095", + "step": 3841, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:22.893165", + "step": 3841, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.14373518526554108, + "timestamp": "2025-09-05 09:18:22.895756", + "step": 3842, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:23.098992", + "step": 3842, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3351125717163086, + "timestamp": "2025-09-05 09:18:23.101228", + "step": 3843, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:18:23.296487", + "step": 3843, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3412880599498749, + "timestamp": "2025-09-05 09:18:23.310519", + "step": 3844, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:18:23.496121", + "step": 3844, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22351853549480438, + "timestamp": "2025-09-05 09:18:23.498529", + "step": 3845, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:23.692705", + "step": 3845, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31305429339408875, + "timestamp": "2025-09-05 09:18:23.695051", + "step": 3846, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:18:23.860711", + "step": 3846, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3345796763896942, + "timestamp": "2025-09-05 09:18:23.862968", + "step": 3847, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:24.066541", + "step": 3847, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2449566274881363, + "timestamp": "2025-09-05 09:18:24.081763", + "step": 3848, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:18:24.276787", + "step": 3848, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23441752791404724, + "timestamp": "2025-09-05 09:18:24.279105", + "step": 3849, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:18:24.474893", + "step": 3849, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2842091917991638, + "timestamp": "2025-09-05 09:18:24.477000", + "step": 3850, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:18:24.673146", + "step": 3850, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4027588963508606, + "timestamp": "2025-09-05 09:18:24.675318", + "step": 3851, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:18:24.871283", + "step": 3851, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.36195552349090576, + "timestamp": "2025-09-05 09:18:24.885328", + "step": 3852, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:18:25.070758", + "step": 3852, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21417009830474854, + "timestamp": "2025-09-05 09:18:25.073293", + "step": 3853, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:25.268334", + "step": 3853, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21039460599422455, + "timestamp": "2025-09-05 09:18:25.270383", + "step": 3854, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:25.464573", + "step": 3854, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2987714409828186, + "timestamp": "2025-09-05 09:18:25.466698", + "step": 3855, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:18:25.628851", + "step": 3855, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3201431930065155, + "timestamp": "2025-09-05 09:18:25.645120", + "step": 3856, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:25.839423", + "step": 3856, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3816319704055786, + "timestamp": "2025-09-05 09:18:25.841608", + "step": 3857, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:18:26.035555", + "step": 3857, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29688769578933716, + "timestamp": "2025-09-05 09:18:26.037739", + "step": 3858, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:18:26.230567", + "step": 3858, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3592606484889984, + "timestamp": "2025-09-05 09:18:26.232683", + "step": 3859, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:18:26.426548", + "step": 3859, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3628304898738861, + "timestamp": "2025-09-05 09:18:26.440593", + "step": 3860, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:18:31.083837", + "step": 3860, + "epoch": 3 + }, + { + "type": "pplx", + "content": 56.521663876148416, + "timestamp": "2025-09-05 09:18:31.085627", + "step": 3860, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:18:31.245688", + "step": 3860, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2278020679950714, + "timestamp": "2025-09-05 09:18:31.248250", + "step": 3861, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:31.414379", + "step": 3861, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2553630471229553, + "timestamp": "2025-09-05 09:18:31.416714", + "step": 3862, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:31.622523", + "step": 3862, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31360796093940735, + "timestamp": "2025-09-05 09:18:31.625098", + "step": 3863, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:31.821089", + "step": 3863, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.23280321061611176, + "timestamp": "2025-09-05 09:18:31.835268", + "step": 3864, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:32.022303", + "step": 3864, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2609652876853943, + "timestamp": "2025-09-05 09:18:32.024305", + "step": 3865, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:18:32.226513", + "step": 3865, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.44476309418678284, + "timestamp": "2025-09-05 09:18:32.228929", + "step": 3866, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:18:32.422074", + "step": 3866, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1847713738679886, + "timestamp": "2025-09-05 09:18:32.424353", + "step": 3867, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:32.618727", + "step": 3867, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31068137288093567, + "timestamp": "2025-09-05 09:18:32.633213", + "step": 3868, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:32.820267", + "step": 3868, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2597085237503052, + "timestamp": "2025-09-05 09:18:32.822691", + "step": 3869, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:33.026080", + "step": 3869, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.27988213300704956, + "timestamp": "2025-09-05 09:18:33.028025", + "step": 3870, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:33.222332", + "step": 3870, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.28773733973503113, + "timestamp": "2025-09-05 09:18:33.224225", + "step": 3871, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:18:33.418058", + "step": 3871, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2838188707828522, + "timestamp": "2025-09-05 09:18:33.432683", + "step": 3872, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:18:33.620117", + "step": 3872, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.31004562973976135, + "timestamp": "2025-09-05 09:18:33.622110", + "step": 3873, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:33.815513", + "step": 3873, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.19978618621826172, + "timestamp": "2025-09-05 09:18:33.817791", + "step": 3874, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:34.021912", + "step": 3874, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.18819376826286316, + "timestamp": "2025-09-05 09:18:34.024432", + "step": 3875, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:34.219846", + "step": 3875, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2756434381008148, + "timestamp": "2025-09-05 09:18:34.234092", + "step": 3876, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 240 + ], + "flops": 4800029206464.0 + }, + "timestamp": "2025-09-05 09:18:34.423758", + "step": 3876, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22656309604644775, + "timestamp": "2025-09-05 09:18:34.426614", + "step": 3877, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:18:34.632024", + "step": 3877, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3541288375854492, + "timestamp": "2025-09-05 09:18:34.633953", + "step": 3878, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:18:34.798791", + "step": 3878, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1255226582288742, + "timestamp": "2025-09-05 09:18:34.800876", + "step": 3879, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:18:34.964815", + "step": 3879, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.345575213432312, + "timestamp": "2025-09-05 09:18:34.981003", + "step": 3880, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:18:39.638308", + "step": 3880, + "epoch": 3 + }, + { + "type": "pplx", + "content": 57.5111446457821, + "timestamp": "2025-09-05 09:18:39.640566", + "step": 3880, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3880", + "timestamp": "2025-09-05 09:18:40.303411", + "step": 3880, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:18:40.494156", + "step": 3880, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.29227620363235474, + "timestamp": "2025-09-05 09:18:40.496215", + "step": 3881, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:18:40.708619", + "step": 3881, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30040618777275085, + "timestamp": "2025-09-05 09:18:40.710697", + "step": 3882, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:40.906165", + "step": 3882, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2703894376754761, + "timestamp": "2025-09-05 09:18:40.908114", + "step": 3883, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:18:41.104515", + "step": 3883, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.344621866941452, + "timestamp": "2025-09-05 09:18:41.120488", + "step": 3884, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:41.315270", + "step": 3884, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.17300912737846375, + "timestamp": "2025-09-05 09:18:41.318283", + "step": 3885, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 224 + ], + "flops": 4480027263872.0 + }, + "timestamp": "2025-09-05 09:18:41.525061", + "step": 3885, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.4111814796924591, + "timestamp": "2025-09-05 09:18:41.527036", + "step": 3886, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:41.724233", + "step": 3886, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.32882702350616455, + "timestamp": "2025-09-05 09:18:41.726372", + "step": 3887, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 240 + ], + "flops": 4800029206464.0 + }, + "timestamp": "2025-09-05 09:18:41.932520", + "step": 3887, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.361674040555954, + "timestamp": "2025-09-05 09:18:41.941725", + "step": 3888, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:42.104101", + "step": 3888, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1684613674879074, + "timestamp": "2025-09-05 09:18:42.105741", + "step": 3889, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:18:42.309251", + "step": 3889, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.20353275537490845, + "timestamp": "2025-09-05 09:18:42.311380", + "step": 3890, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:18:42.514218", + "step": 3890, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22002647817134857, + "timestamp": "2025-09-05 09:18:42.516313", + "step": 3891, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:42.711114", + "step": 3891, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3730429708957672, + "timestamp": "2025-09-05 09:18:42.724982", + "step": 3892, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-05 09:18:42.912985", + "step": 3892, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.30563536286354065, + "timestamp": "2025-09-05 09:18:42.915080", + "step": 3893, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:43.110081", + "step": 3893, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3045671880245209, + "timestamp": "2025-09-05 09:18:43.112307", + "step": 3894, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 192 + ], + "flops": 3840023378688.0 + }, + "timestamp": "2025-09-05 09:18:43.329512", + "step": 3894, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2163662165403366, + "timestamp": "2025-09-05 09:18:43.331571", + "step": 3895, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 208 + ], + "flops": 4160025321280.0 + }, + "timestamp": "2025-09-05 09:18:43.529117", + "step": 3895, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.2787623703479767, + "timestamp": "2025-09-05 09:18:43.537962", + "step": 3896, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:43.700965", + "step": 3896, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.3941419720649719, + "timestamp": "2025-09-05 09:18:43.703207", + "step": 3897, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:18:43.867354", + "step": 3897, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.22157038748264313, + "timestamp": "2025-09-05 09:18:43.869374", + "step": 3898, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-05 09:18:44.035260", + "step": 3898, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.21150384843349457, + "timestamp": "2025-09-05 09:18:44.037093", + "step": 3899, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-05 09:18:44.202709", + "step": 3899, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.14150860905647278, + "timestamp": "2025-09-05 09:18:44.211577", + "step": 3900, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:18:48.840423", + "step": 3900, + "epoch": 3 + }, + { + "type": "pplx", + "content": 57.68526548554968, + "timestamp": "2025-09-05 09:18:48.842740", + "step": 3900, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 208 + ], + "batch_size": 8, + "flops": 4151977605760 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 192 + ], + "batch_size": 8, + "flops": 3832594718208 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 128 + ], + "batch_size": 8, + "flops": 2555063168000 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 144 + ], + "batch_size": 8, + "flops": 2874446055552 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 176 + ], + "batch_size": 8, + "flops": 3513211830656 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 160 + ], + "batch_size": 8, + "flops": 3193828943104 + } + ], + "timestamp": "2025-09-05 09:18:53.413113", + "step": 3900, + "epoch": 3 + }, + { + "type": "pplx", + "content": 57.68526548554968, + "timestamp": "2025-09-05 09:18:53.415162", + "step": 3900, + "epoch": 3 + }, + { + "type": "best_pplx", + "content": 52.40073254912315, + "timestamp": "2025-09-05 09:18:53.416944", + "step": 3900, + "epoch": 3 + }, + { + "type": "best_step", + "content": 3040, + "timestamp": "2025-09-05 09:18:53.418308", + "step": 3900, + "epoch": 3 + }, + { + "type": "total_pplx_flops", + "content": 49705559881469696, + "timestamp": "2025-09-05 09:18:53.420522", + "step": 3900, + "epoch": 3 + }, + { + "type": "total_train_flops", + "content": 1.368968336766336e+16, + "timestamp": "2025-09-05 09:18:53.781419", + "step": 3900, + "epoch": 3 + } + ], + "best_evals": { + "pplx": { + "score": 52.40073254912315, + "step": 3040 + }, + "rouge1": { + "precision": 0.30901464044392746, + "recall": 0.2632675026888113, + "fmeasure": 0.2558464239565212 + } + } +} \ No newline at end of file