diff --git "a/experiment_config.json" "b/experiment_config.json" new file mode 100644--- /dev/null +++ "b/experiment_config.json" @@ -0,0 +1,81748 @@ +{ + "training_args": { + "output_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters_2/nlu_mrpc_lora_v1", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": true, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 4, + "per_device_eval_batch_size": 8, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 5e-05, + "weight_decay": 0.0, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3, + "max_steps": -1, + "lr_scheduler_type": "linear", + "lr_scheduler_kwargs": {}, + "warmup_ratio": 0.0, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters_2/nlu_mrpc_lora_v1/runs/Sep30_22-09-37_gx10", + "logging_strategy": "steps", + "logging_first_step": false, + "logging_steps": 20, + "logging_nan_inf_filter": true, + "save_strategy": "epoch", + "save_steps": 500, + "save_total_limit": null, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "seed": 42, + "data_seed": null, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": false, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": [], + "dataloader_drop_last": false, + "eval_steps": 57, + "dataloader_num_workers": 0, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters_2/nlu_mrpc_lora_v1", + "disable_tqdm": false, + "remove_unused_columns": true, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": null, + "greater_is_better": null, + "ignore_data_skip": false, + "fsdp": [], + "fsdp_min_num_params": 0, + "fsdp_config": { + "min_num_params": 0, + "xla": false, + "xla_fsdp_v2": false, + "xla_fsdp_grad_ckpt": false + }, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "split_batches": false, + "dispatch_batches": null, + "even_batches": true, + "use_seedable_sampler": true, + "non_blocking": false, + "gradient_accumulation_kwargs": null + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_token": "", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": false, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": "", + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false + }, + "lora_config": { + "task_type": "CAUSAL_LM", + "peft_type": "LORA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "revision": null, + "inference_mode": false, + "r": 16, + "target_modules": [ + "o_proj", + "k_proj", + "v_proj", + "gate_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "exclude_modules": null, + "lora_alpha": 16, + "lora_dropout": 0.1, + "fan_in_fan_out": false, + "bias": "none", + "use_rslora": true, + "modules_to_save": null, + "init_lora_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "rank_pattern": {}, + "alpha_pattern": {}, + "megatron_config": null, + "megatron_core": "megatron.core", + "trainable_token_indices": null, + "loftq_config": {}, + "eva_config": null, + "corda_config": null, + "use_dora": false, + "alora_invocation_tokens": null, + "use_qalora": false, + "qalora_group_size": 16, + "layer_replication": null, + "runtime_config": { + "ephemeral_gpu_offload": false + }, + "lora_bias": false, + "target_parameters": null, + "arrow_config": null + }, + "flops": { + "eval": 5062218940038400, + "train": 7174123736893632.0, + "total": 1.2236342676932032e+16 + }, + "total": { + "total": 67344.37337, + "train": 48675.5105, + "eval": 18668.862870000004 + }, + "logs": [ + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:09:45.720522", + "step": 0, + "epoch": 0 + }, + { + "type": "pplx", + "content": 226674977.87649825, + "timestamp": "2025-09-30 22:09:45.725910", + "step": 0, + "epoch": 0 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:45.829654", + "step": 0, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.7057779431343079, + "timestamp": "2025-09-30 22:09:45.833808", + "step": 1, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:45.927323", + "step": 1, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.6982383131980896, + "timestamp": "2025-09-30 22:09:45.931697", + "step": 2, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:45.989452", + "step": 2, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.7418850064277649, + "timestamp": "2025-09-30 22:09:46.001612", + "step": 3, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:46.069644", + "step": 3, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.7169809341430664, + "timestamp": "2025-09-30 22:09:46.122890", + "step": 4, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:46.194754", + "step": 4, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5053693652153015, + "timestamp": "2025-09-30 22:09:46.207488", + "step": 5, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:46.291138", + "step": 5, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5166937708854675, + "timestamp": "2025-09-30 22:09:46.303245", + "step": 6, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:46.381366", + "step": 6, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.5014775991439819, + "timestamp": "2025-09-30 22:09:46.385931", + "step": 7, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:46.454635", + "step": 7, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.514745831489563, + "timestamp": "2025-09-30 22:09:46.469695", + "step": 8, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:46.539001", + "step": 8, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3292957842350006, + "timestamp": "2025-09-30 22:09:46.546966", + "step": 9, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:46.612140", + "step": 9, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3126233220100403, + "timestamp": "2025-09-30 22:09:46.617552", + "step": 10, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:46.675520", + "step": 10, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3234768509864807, + "timestamp": "2025-09-30 22:09:46.690861", + "step": 11, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:46.756152", + "step": 11, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3332277834415436, + "timestamp": "2025-09-30 22:09:46.765624", + "step": 12, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:46.823098", + "step": 12, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.16887438297271729, + "timestamp": "2025-09-30 22:09:46.828694", + "step": 13, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:46.891179", + "step": 13, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.15498413145542145, + "timestamp": "2025-09-30 22:09:46.894434", + "step": 14, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:46.961785", + "step": 14, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.15807364881038666, + "timestamp": "2025-09-30 22:09:46.965388", + "step": 15, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:47.021211", + "step": 15, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.16345378756523132, + "timestamp": "2025-09-30 22:09:47.037888", + "step": 16, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:47.092321", + "step": 16, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06315434724092484, + "timestamp": "2025-09-30 22:09:47.095399", + "step": 17, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:47.155095", + "step": 17, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07140235602855682, + "timestamp": "2025-09-30 22:09:47.159326", + "step": 18, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:47.217343", + "step": 18, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07542860507965088, + "timestamp": "2025-09-30 22:09:47.231838", + "step": 19, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:47.297426", + "step": 19, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07850368320941925, + "timestamp": "2025-09-30 22:09:47.314640", + "step": 20, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:47.380582", + "step": 20, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05402212589979172, + "timestamp": "2025-09-30 22:09:47.393830", + "step": 21, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:47.463154", + "step": 21, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04844330623745918, + "timestamp": "2025-09-30 22:09:47.477962", + "step": 22, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:47.538665", + "step": 22, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04047473892569542, + "timestamp": "2025-09-30 22:09:47.542223", + "step": 23, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:47.599055", + "step": 23, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03956778720021248, + "timestamp": "2025-09-30 22:09:47.605916", + "step": 24, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:47.666928", + "step": 24, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03282862529158592, + "timestamp": "2025-09-30 22:09:47.680006", + "step": 25, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:47.747238", + "step": 25, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.047466158866882324, + "timestamp": "2025-09-30 22:09:47.751128", + "step": 26, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:47.809803", + "step": 26, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02898944355547428, + "timestamp": "2025-09-30 22:09:47.813747", + "step": 27, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:47.883244", + "step": 27, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02849601022899151, + "timestamp": "2025-09-30 22:09:47.892740", + "step": 28, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:47.948731", + "step": 28, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04645688086748123, + "timestamp": "2025-09-30 22:09:47.957611", + "step": 29, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:48.014118", + "step": 29, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02362995222210884, + "timestamp": "2025-09-30 22:09:48.017619", + "step": 30, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:48.092874", + "step": 30, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02474798448383808, + "timestamp": "2025-09-30 22:09:48.098478", + "step": 31, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:48.158112", + "step": 31, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025140417739748955, + "timestamp": "2025-09-30 22:09:48.165508", + "step": 32, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:48.235509", + "step": 32, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.004965892527252436, + "timestamp": "2025-09-30 22:09:48.249272", + "step": 33, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:48.306709", + "step": 33, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03571665287017822, + "timestamp": "2025-09-30 22:09:48.321337", + "step": 34, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:48.396615", + "step": 34, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.005427081603556871, + "timestamp": "2025-09-30 22:09:48.400758", + "step": 35, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:48.463184", + "step": 35, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022603018209338188, + "timestamp": "2025-09-30 22:09:48.470249", + "step": 36, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:48.526578", + "step": 36, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017443154007196426, + "timestamp": "2025-09-30 22:09:48.530394", + "step": 37, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:48.586995", + "step": 37, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023570295423269272, + "timestamp": "2025-09-30 22:09:48.600109", + "step": 38, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:48.662782", + "step": 38, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008299198001623154, + "timestamp": "2025-09-30 22:09:48.666655", + "step": 39, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:48.724407", + "step": 39, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023102333769202232, + "timestamp": "2025-09-30 22:09:48.740457", + "step": 40, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:48.806911", + "step": 40, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006827209610491991, + "timestamp": "2025-09-30 22:09:48.810978", + "step": 41, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:48.869752", + "step": 41, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02204090915620327, + "timestamp": "2025-09-30 22:09:48.876097", + "step": 42, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:48.935298", + "step": 42, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.036014724522829056, + "timestamp": "2025-09-30 22:09:48.938917", + "step": 43, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:09:49.019588", + "step": 43, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.005299022886902094, + "timestamp": "2025-09-30 22:09:49.030945", + "step": 44, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:49.092096", + "step": 44, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007433965802192688, + "timestamp": "2025-09-30 22:09:49.095682", + "step": 45, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:49.153079", + "step": 45, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007419214118272066, + "timestamp": "2025-09-30 22:09:49.156971", + "step": 46, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:49.226418", + "step": 46, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03738881275057793, + "timestamp": "2025-09-30 22:09:49.231831", + "step": 47, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:49.296369", + "step": 47, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021278386935591698, + "timestamp": "2025-09-30 22:09:49.308245", + "step": 48, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:49.369779", + "step": 48, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008051230572164059, + "timestamp": "2025-09-30 22:09:49.382874", + "step": 49, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:49.448881", + "step": 49, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03269955888390541, + "timestamp": "2025-09-30 22:09:49.455582", + "step": 50, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:49.521657", + "step": 50, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009572034701704979, + "timestamp": "2025-09-30 22:09:49.535895", + "step": 51, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:49.607418", + "step": 51, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03262675181031227, + "timestamp": "2025-09-30 22:09:49.618732", + "step": 52, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:49.698096", + "step": 52, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.031087083742022514, + "timestamp": "2025-09-30 22:09:49.702960", + "step": 53, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:09:49.774471", + "step": 53, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028552474454045296, + "timestamp": "2025-09-30 22:09:49.790307", + "step": 54, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:49.858357", + "step": 54, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021799175068736076, + "timestamp": "2025-09-30 22:09:49.866314", + "step": 55, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:49.946732", + "step": 55, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03151797503232956, + "timestamp": "2025-09-30 22:09:49.956360", + "step": 56, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:50.025091", + "step": 56, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02139838971197605, + "timestamp": "2025-09-30 22:09:50.036752", + "step": 57, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:09:51.671705", + "step": 57, + "epoch": 1 + }, + { + "type": "pplx", + "content": 33459661.644647755, + "timestamp": "2025-09-30 22:09:51.677492", + "step": 57, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:51.732715", + "step": 57, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020825445652008057, + "timestamp": "2025-09-30 22:09:51.736354", + "step": 58, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:51.799744", + "step": 58, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019294634461402893, + "timestamp": "2025-09-30 22:09:51.804206", + "step": 59, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:51.863673", + "step": 59, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020074540749192238, + "timestamp": "2025-09-30 22:09:51.879375", + "step": 60, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:51.942712", + "step": 60, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019697507843375206, + "timestamp": "2025-09-30 22:09:51.946783", + "step": 61, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:52.013178", + "step": 61, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02132870815694332, + "timestamp": "2025-09-30 22:09:52.017164", + "step": 62, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:52.092840", + "step": 62, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01894804835319519, + "timestamp": "2025-09-30 22:09:52.095762", + "step": 63, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:52.174530", + "step": 63, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016838889569044113, + "timestamp": "2025-09-30 22:09:52.182844", + "step": 64, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:52.261938", + "step": 64, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01592377945780754, + "timestamp": "2025-09-30 22:09:52.271881", + "step": 65, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:52.339697", + "step": 65, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02463250793516636, + "timestamp": "2025-09-30 22:09:52.343673", + "step": 66, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:52.413263", + "step": 66, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018470000475645065, + "timestamp": "2025-09-30 22:09:52.424819", + "step": 67, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:52.493307", + "step": 67, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022894982248544693, + "timestamp": "2025-09-30 22:09:52.507145", + "step": 68, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:52.577240", + "step": 68, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029143383726477623, + "timestamp": "2025-09-30 22:09:52.581726", + "step": 69, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:52.649330", + "step": 69, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020210010930895805, + "timestamp": "2025-09-30 22:09:52.652808", + "step": 70, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:52.721839", + "step": 70, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011494003236293793, + "timestamp": "2025-09-30 22:09:52.726534", + "step": 71, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:52.789538", + "step": 71, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028703901916742325, + "timestamp": "2025-09-30 22:09:52.796646", + "step": 72, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:52.854779", + "step": 72, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018709659576416016, + "timestamp": "2025-09-30 22:09:52.857531", + "step": 73, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:52.929717", + "step": 73, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029944155365228653, + "timestamp": "2025-09-30 22:09:52.933498", + "step": 74, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:52.993304", + "step": 74, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020325303077697754, + "timestamp": "2025-09-30 22:09:52.997865", + "step": 75, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:09:53.069769", + "step": 75, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02151350863277912, + "timestamp": "2025-09-30 22:09:53.078068", + "step": 76, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:53.137966", + "step": 76, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0210565198212862, + "timestamp": "2025-09-30 22:09:53.142397", + "step": 77, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:53.203236", + "step": 77, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03604406490921974, + "timestamp": "2025-09-30 22:09:53.217868", + "step": 78, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:53.277099", + "step": 78, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.036332327872514725, + "timestamp": "2025-09-30 22:09:53.281235", + "step": 79, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:53.342746", + "step": 79, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028764527291059494, + "timestamp": "2025-09-30 22:09:53.351222", + "step": 80, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:53.420316", + "step": 80, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019019123166799545, + "timestamp": "2025-09-30 22:09:53.424641", + "step": 81, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:53.480453", + "step": 81, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02148059569299221, + "timestamp": "2025-09-30 22:09:53.486176", + "step": 82, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:53.545094", + "step": 82, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04821930453181267, + "timestamp": "2025-09-30 22:09:53.549757", + "step": 83, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:53.621024", + "step": 83, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02800285816192627, + "timestamp": "2025-09-30 22:09:53.628236", + "step": 84, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:53.694859", + "step": 84, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0229884572327137, + "timestamp": "2025-09-30 22:09:53.702009", + "step": 85, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:53.769420", + "step": 85, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014066072180867195, + "timestamp": "2025-09-30 22:09:53.773399", + "step": 86, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:53.840322", + "step": 86, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017103631049394608, + "timestamp": "2025-09-30 22:09:53.843455", + "step": 87, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:53.903605", + "step": 87, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020594418048858643, + "timestamp": "2025-09-30 22:09:53.919037", + "step": 88, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:53.986826", + "step": 88, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020471200346946716, + "timestamp": "2025-09-30 22:09:53.991020", + "step": 89, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:54.054065", + "step": 89, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023065567016601562, + "timestamp": "2025-09-30 22:09:54.059760", + "step": 90, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:54.134584", + "step": 90, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026240045204758644, + "timestamp": "2025-09-30 22:09:54.147631", + "step": 91, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:09:54.215981", + "step": 91, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024411886930465698, + "timestamp": "2025-09-30 22:09:54.233867", + "step": 92, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:54.308215", + "step": 92, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022564176470041275, + "timestamp": "2025-09-30 22:09:54.313481", + "step": 93, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:54.376766", + "step": 93, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02625674568116665, + "timestamp": "2025-09-30 22:09:54.381908", + "step": 94, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:54.442608", + "step": 94, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026146892458200455, + "timestamp": "2025-09-30 22:09:54.448685", + "step": 95, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:54.517320", + "step": 95, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023254042491316795, + "timestamp": "2025-09-30 22:09:54.525273", + "step": 96, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:54.582927", + "step": 96, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021697642281651497, + "timestamp": "2025-09-30 22:09:54.587639", + "step": 97, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:54.648320", + "step": 97, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020044708624482155, + "timestamp": "2025-09-30 22:09:54.660964", + "step": 98, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:54.731315", + "step": 98, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019530335441231728, + "timestamp": "2025-09-30 22:09:54.734833", + "step": 99, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:54.794372", + "step": 99, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02076215110719204, + "timestamp": "2025-09-30 22:09:54.803435", + "step": 100, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:54.864805", + "step": 100, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018885493278503418, + "timestamp": "2025-09-30 22:09:54.869711", + "step": 101, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:54.928484", + "step": 101, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015139119699597359, + "timestamp": "2025-09-30 22:09:54.932241", + "step": 102, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:54.991083", + "step": 102, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020732766017317772, + "timestamp": "2025-09-30 22:09:54.994243", + "step": 103, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:55.058758", + "step": 103, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.031727004796266556, + "timestamp": "2025-09-30 22:09:55.066276", + "step": 104, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:55.123302", + "step": 104, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03581748530268669, + "timestamp": "2025-09-30 22:09:55.132399", + "step": 105, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:55.190214", + "step": 105, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0061849248595535755, + "timestamp": "2025-09-30 22:09:55.193819", + "step": 106, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:55.257208", + "step": 106, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02108680084347725, + "timestamp": "2025-09-30 22:09:55.263242", + "step": 107, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:55.333196", + "step": 107, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.054573871195316315, + "timestamp": "2025-09-30 22:09:55.352127", + "step": 108, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:55.419704", + "step": 108, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.056397486478090286, + "timestamp": "2025-09-30 22:09:55.423709", + "step": 109, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:09:55.503381", + "step": 109, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.045181386172771454, + "timestamp": "2025-09-30 22:09:55.507646", + "step": 110, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:55.565463", + "step": 110, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006864451337605715, + "timestamp": "2025-09-30 22:09:55.570595", + "step": 111, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:55.629518", + "step": 111, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.035181932151317596, + "timestamp": "2025-09-30 22:09:55.638791", + "step": 112, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:55.698419", + "step": 112, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02644294500350952, + "timestamp": "2025-09-30 22:09:55.702122", + "step": 113, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:55.762278", + "step": 113, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024839377030730247, + "timestamp": "2025-09-30 22:09:55.767722", + "step": 114, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:09:57.404156", + "step": 114, + "epoch": 1 + }, + { + "type": "pplx", + "content": 34367165.57846854, + "timestamp": "2025-09-30 22:09:57.417807", + "step": 114, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:57.476760", + "step": 114, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009317861869931221, + "timestamp": "2025-09-30 22:09:57.481123", + "step": 115, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:09:57.549026", + "step": 115, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03304336592555046, + "timestamp": "2025-09-30 22:09:57.556362", + "step": 116, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:57.621741", + "step": 116, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02041228488087654, + "timestamp": "2025-09-30 22:09:57.625489", + "step": 117, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:57.707532", + "step": 117, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020021427422761917, + "timestamp": "2025-09-30 22:09:57.711221", + "step": 118, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:57.770352", + "step": 118, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01669391058385372, + "timestamp": "2025-09-30 22:09:57.774354", + "step": 119, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:57.833648", + "step": 119, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019399311393499374, + "timestamp": "2025-09-30 22:09:57.840377", + "step": 120, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:57.902145", + "step": 120, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027869191020727158, + "timestamp": "2025-09-30 22:09:57.905157", + "step": 121, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:09:57.963613", + "step": 121, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026405667886137962, + "timestamp": "2025-09-30 22:09:57.967225", + "step": 122, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:09:58.036235", + "step": 122, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025716735050082207, + "timestamp": "2025-09-30 22:09:58.040067", + "step": 123, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:58.128759", + "step": 123, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021928545087575912, + "timestamp": "2025-09-30 22:09:58.136284", + "step": 124, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:58.194040", + "step": 124, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024018412455916405, + "timestamp": "2025-09-30 22:09:58.203869", + "step": 125, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:58.275576", + "step": 125, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024753393605351448, + "timestamp": "2025-09-30 22:09:58.279054", + "step": 126, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:09:58.335015", + "step": 126, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025028575211763382, + "timestamp": "2025-09-30 22:09:58.339229", + "step": 127, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:58.409024", + "step": 127, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022491442039608955, + "timestamp": "2025-09-30 22:09:58.415306", + "step": 128, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:58.472633", + "step": 128, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021387087181210518, + "timestamp": "2025-09-30 22:09:58.475769", + "step": 129, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:58.544050", + "step": 129, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030490174889564514, + "timestamp": "2025-09-30 22:09:58.547113", + "step": 130, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:58.608056", + "step": 130, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021837439388036728, + "timestamp": "2025-09-30 22:09:58.611804", + "step": 131, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:58.672124", + "step": 131, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022853758186101913, + "timestamp": "2025-09-30 22:09:58.679715", + "step": 132, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:58.743010", + "step": 132, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02393500693142414, + "timestamp": "2025-09-30 22:09:58.747907", + "step": 133, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:58.812859", + "step": 133, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024709191173315048, + "timestamp": "2025-09-30 22:09:58.817248", + "step": 134, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:09:58.874630", + "step": 134, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02340940572321415, + "timestamp": "2025-09-30 22:09:58.878363", + "step": 135, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:58.955588", + "step": 135, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021506870165467262, + "timestamp": "2025-09-30 22:09:58.971620", + "step": 136, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:09:59.036619", + "step": 136, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01850222609937191, + "timestamp": "2025-09-30 22:09:59.041465", + "step": 137, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:59.114440", + "step": 137, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025246715173125267, + "timestamp": "2025-09-30 22:09:59.119401", + "step": 138, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:59.185394", + "step": 138, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019027644768357277, + "timestamp": "2025-09-30 22:09:59.188937", + "step": 139, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:59.254993", + "step": 139, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023021994158625603, + "timestamp": "2025-09-30 22:09:59.268285", + "step": 140, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:59.333791", + "step": 140, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018915778025984764, + "timestamp": "2025-09-30 22:09:59.338389", + "step": 141, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:59.397001", + "step": 141, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02081509307026863, + "timestamp": "2025-09-30 22:09:59.400430", + "step": 142, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:59.475598", + "step": 142, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02820817194879055, + "timestamp": "2025-09-30 22:09:59.478597", + "step": 143, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:59.539292", + "step": 143, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01714860461652279, + "timestamp": "2025-09-30 22:09:59.545917", + "step": 144, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:59.613640", + "step": 144, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01967170275747776, + "timestamp": "2025-09-30 22:09:59.626738", + "step": 145, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:59.695249", + "step": 145, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015452763997018337, + "timestamp": "2025-09-30 22:09:59.698910", + "step": 146, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:59.758042", + "step": 146, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.031086495146155357, + "timestamp": "2025-09-30 22:09:59.761287", + "step": 147, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:59.832230", + "step": 147, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.035771798342466354, + "timestamp": "2025-09-30 22:09:59.838403", + "step": 148, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:09:59.896383", + "step": 148, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.052823007106781006, + "timestamp": "2025-09-30 22:09:59.904646", + "step": 149, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:09:59.974528", + "step": 149, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0519816055893898, + "timestamp": "2025-09-30 22:09:59.979756", + "step": 150, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:00.039600", + "step": 150, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020907681435346603, + "timestamp": "2025-09-30 22:10:00.044125", + "step": 151, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:00.110151", + "step": 151, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.005571091081947088, + "timestamp": "2025-09-30 22:10:00.122291", + "step": 152, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:00.190990", + "step": 152, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006688456982374191, + "timestamp": "2025-09-30 22:10:00.193396", + "step": 153, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:00.257532", + "step": 153, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02993638627231121, + "timestamp": "2025-09-30 22:10:00.260041", + "step": 154, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:00.326910", + "step": 154, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03653806075453758, + "timestamp": "2025-09-30 22:10:00.336903", + "step": 155, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:00.399218", + "step": 155, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02178972400724888, + "timestamp": "2025-09-30 22:10:00.406459", + "step": 156, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:00.471221", + "step": 156, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03773200884461403, + "timestamp": "2025-09-30 22:10:00.481225", + "step": 157, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:00.544734", + "step": 157, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06024942919611931, + "timestamp": "2025-09-30 22:10:00.555024", + "step": 158, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:00.626258", + "step": 158, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.051464516669511795, + "timestamp": "2025-09-30 22:10:00.629304", + "step": 159, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:00.695658", + "step": 159, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03535311296582222, + "timestamp": "2025-09-30 22:10:00.705942", + "step": 160, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:00.770799", + "step": 160, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021262094378471375, + "timestamp": "2025-09-30 22:10:00.773120", + "step": 161, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:00.836550", + "step": 161, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02948129177093506, + "timestamp": "2025-09-30 22:10:00.840114", + "step": 162, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:00.903882", + "step": 162, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020708870142698288, + "timestamp": "2025-09-30 22:10:00.918926", + "step": 163, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:00.987530", + "step": 163, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01856974884867668, + "timestamp": "2025-09-30 22:10:01.003210", + "step": 164, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:01.086270", + "step": 164, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02260742522776127, + "timestamp": "2025-09-30 22:10:01.089350", + "step": 165, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:01.159681", + "step": 165, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024135034531354904, + "timestamp": "2025-09-30 22:10:01.167552", + "step": 166, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:01.253829", + "step": 166, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024836549535393715, + "timestamp": "2025-09-30 22:10:01.257725", + "step": 167, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:01.317725", + "step": 167, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021925508975982666, + "timestamp": "2025-09-30 22:10:01.324948", + "step": 168, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:01.383629", + "step": 168, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026183176785707474, + "timestamp": "2025-09-30 22:10:01.386896", + "step": 169, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:01.445667", + "step": 169, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027825552970170975, + "timestamp": "2025-09-30 22:10:01.448737", + "step": 170, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:01.514985", + "step": 170, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02336815930902958, + "timestamp": "2025-09-30 22:10:01.524465", + "step": 171, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:10:03.122432", + "step": 171, + "epoch": 1 + }, + { + "type": "pplx", + "content": 30444627.67679443, + "timestamp": "2025-09-30 22:10:03.130412", + "step": 171, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:03.189713", + "step": 171, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025213051587343216, + "timestamp": "2025-09-30 22:10:03.207338", + "step": 172, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:03.270568", + "step": 172, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026924652978777885, + "timestamp": "2025-09-30 22:10:03.289893", + "step": 173, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:03.358458", + "step": 173, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019906342029571533, + "timestamp": "2025-09-30 22:10:03.367731", + "step": 174, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:03.431420", + "step": 174, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02854965254664421, + "timestamp": "2025-09-30 22:10:03.435169", + "step": 175, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:03.502168", + "step": 175, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029346134513616562, + "timestamp": "2025-09-30 22:10:03.513567", + "step": 176, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:03.569994", + "step": 176, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027899976819753647, + "timestamp": "2025-09-30 22:10:03.595350", + "step": 177, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:03.660257", + "step": 177, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.031159192323684692, + "timestamp": "2025-09-30 22:10:03.664061", + "step": 178, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:03.758193", + "step": 178, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025665393099188805, + "timestamp": "2025-09-30 22:10:03.762946", + "step": 179, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:03.828469", + "step": 179, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02343871258199215, + "timestamp": "2025-09-30 22:10:03.862207", + "step": 180, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:03.934534", + "step": 180, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023453569039702415, + "timestamp": "2025-09-30 22:10:03.941948", + "step": 181, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:04.015301", + "step": 181, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025829827412962914, + "timestamp": "2025-09-30 22:10:04.020573", + "step": 182, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:04.091297", + "step": 182, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029110519215464592, + "timestamp": "2025-09-30 22:10:04.102375", + "step": 183, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:04.171598", + "step": 183, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02779129333794117, + "timestamp": "2025-09-30 22:10:04.182335", + "step": 184, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:04.244865", + "step": 184, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026309384033083916, + "timestamp": "2025-09-30 22:10:04.248569", + "step": 185, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:04.317008", + "step": 185, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020971303805708885, + "timestamp": "2025-09-30 22:10:04.319635", + "step": 186, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:04.389723", + "step": 186, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026080194860696793, + "timestamp": "2025-09-30 22:10:04.398670", + "step": 187, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:04.462204", + "step": 187, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022094666957855225, + "timestamp": "2025-09-30 22:10:04.474646", + "step": 188, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:04.546212", + "step": 188, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019831160083413124, + "timestamp": "2025-09-30 22:10:04.548835", + "step": 189, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:04.611573", + "step": 189, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02252844348549843, + "timestamp": "2025-09-30 22:10:04.615230", + "step": 190, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:04.698424", + "step": 190, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021682869642972946, + "timestamp": "2025-09-30 22:10:04.700932", + "step": 191, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:04.769139", + "step": 191, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02150973677635193, + "timestamp": "2025-09-30 22:10:04.775344", + "step": 192, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:04.834112", + "step": 192, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014492852613329887, + "timestamp": "2025-09-30 22:10:04.843621", + "step": 193, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:04.918930", + "step": 193, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015967510640621185, + "timestamp": "2025-09-30 22:10:04.923925", + "step": 194, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:04.980759", + "step": 194, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01770109124481678, + "timestamp": "2025-09-30 22:10:04.994927", + "step": 195, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:05.070096", + "step": 195, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02010829746723175, + "timestamp": "2025-09-30 22:10:05.080898", + "step": 196, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:05.151418", + "step": 196, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010940426029264927, + "timestamp": "2025-09-30 22:10:05.165479", + "step": 197, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:05.227588", + "step": 197, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02102290280163288, + "timestamp": "2025-09-30 22:10:05.233173", + "step": 198, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:05.295898", + "step": 198, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.038564298301935196, + "timestamp": "2025-09-30 22:10:05.299015", + "step": 199, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:05.358240", + "step": 199, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030264217406511307, + "timestamp": "2025-09-30 22:10:05.372402", + "step": 200, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:05.436584", + "step": 200, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019798846915364265, + "timestamp": "2025-09-30 22:10:05.440647", + "step": 201, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:05.511249", + "step": 201, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02034393884241581, + "timestamp": "2025-09-30 22:10:05.522972", + "step": 202, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:05.588730", + "step": 202, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019814888015389442, + "timestamp": "2025-09-30 22:10:05.599717", + "step": 203, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:05.665164", + "step": 203, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01977935992181301, + "timestamp": "2025-09-30 22:10:05.671120", + "step": 204, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:05.733763", + "step": 204, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029726387932896614, + "timestamp": "2025-09-30 22:10:05.736828", + "step": 205, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:05.793001", + "step": 205, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008051145821809769, + "timestamp": "2025-09-30 22:10:05.796198", + "step": 206, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:05.855235", + "step": 206, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03421095758676529, + "timestamp": "2025-09-30 22:10:05.859122", + "step": 207, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:05.918769", + "step": 207, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01968538574874401, + "timestamp": "2025-09-30 22:10:05.925875", + "step": 208, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:05.982794", + "step": 208, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03160746023058891, + "timestamp": "2025-09-30 22:10:05.988337", + "step": 209, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:06.054850", + "step": 209, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04332786053419113, + "timestamp": "2025-09-30 22:10:06.062819", + "step": 210, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:06.139111", + "step": 210, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020400291308760643, + "timestamp": "2025-09-30 22:10:06.146696", + "step": 211, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:06.214028", + "step": 211, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05565622076392174, + "timestamp": "2025-09-30 22:10:06.223281", + "step": 212, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:06.280632", + "step": 212, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03217899426817894, + "timestamp": "2025-09-30 22:10:06.287053", + "step": 213, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:06.350273", + "step": 213, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01874413527548313, + "timestamp": "2025-09-30 22:10:06.356717", + "step": 214, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:06.422063", + "step": 214, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.037103354930877686, + "timestamp": "2025-09-30 22:10:06.429637", + "step": 215, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:06.493219", + "step": 215, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030940750613808632, + "timestamp": "2025-09-30 22:10:06.505199", + "step": 216, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:06.578758", + "step": 216, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0197443924844265, + "timestamp": "2025-09-30 22:10:06.585913", + "step": 217, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:06.658187", + "step": 217, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020792236551642418, + "timestamp": "2025-09-30 22:10:06.660699", + "step": 218, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:06.717629", + "step": 218, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02086322009563446, + "timestamp": "2025-09-30 22:10:06.720244", + "step": 219, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:06.795016", + "step": 219, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01735932007431984, + "timestamp": "2025-09-30 22:10:06.801454", + "step": 220, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:06.859512", + "step": 220, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023645946756005287, + "timestamp": "2025-09-30 22:10:06.862985", + "step": 221, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:06.923013", + "step": 221, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026022594422101974, + "timestamp": "2025-09-30 22:10:06.926308", + "step": 222, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:06.984919", + "step": 222, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02121904492378235, + "timestamp": "2025-09-30 22:10:06.991504", + "step": 223, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:07.066529", + "step": 223, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027569929137825966, + "timestamp": "2025-09-30 22:10:07.073649", + "step": 224, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:07.131861", + "step": 224, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022572649642825127, + "timestamp": "2025-09-30 22:10:07.139150", + "step": 225, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:07.196403", + "step": 225, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02417564019560814, + "timestamp": "2025-09-30 22:10:07.198523", + "step": 226, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:07.255106", + "step": 226, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027327995747327805, + "timestamp": "2025-09-30 22:10:07.260826", + "step": 227, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:07.335925", + "step": 227, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018605349585413933, + "timestamp": "2025-09-30 22:10:07.346966", + "step": 228, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:10:08.896406", + "step": 228, + "epoch": 1 + }, + { + "type": "pplx", + "content": 30884345.685848907, + "timestamp": "2025-09-30 22:10:08.897993", + "step": 228, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:08.949580", + "step": 228, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021486392244696617, + "timestamp": "2025-09-30 22:10:08.952193", + "step": 229, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:09.007817", + "step": 229, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02346794866025448, + "timestamp": "2025-09-30 22:10:09.010228", + "step": 230, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:09.066408", + "step": 230, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022495094686746597, + "timestamp": "2025-09-30 22:10:09.068495", + "step": 231, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:09.130931", + "step": 231, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025627722963690758, + "timestamp": "2025-09-30 22:10:09.136540", + "step": 232, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:09.192596", + "step": 232, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025128453969955444, + "timestamp": "2025-09-30 22:10:09.194667", + "step": 233, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:09.257595", + "step": 233, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021439703181385994, + "timestamp": "2025-09-30 22:10:09.262908", + "step": 234, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:09.320130", + "step": 234, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025162285193800926, + "timestamp": "2025-09-30 22:10:09.322981", + "step": 235, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:09.379439", + "step": 235, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026900988072156906, + "timestamp": "2025-09-30 22:10:09.386142", + "step": 236, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:09.441059", + "step": 236, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024046484380960464, + "timestamp": "2025-09-30 22:10:09.445390", + "step": 237, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:09.516816", + "step": 237, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023616518825292587, + "timestamp": "2025-09-30 22:10:09.520114", + "step": 238, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:09.578308", + "step": 238, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025127580389380455, + "timestamp": "2025-09-30 22:10:09.581575", + "step": 239, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:09.651946", + "step": 239, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02287052944302559, + "timestamp": "2025-09-30 22:10:09.659087", + "step": 240, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:09.721058", + "step": 240, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022945618256926537, + "timestamp": "2025-09-30 22:10:09.723823", + "step": 241, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:09.785061", + "step": 241, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023390578106045723, + "timestamp": "2025-09-30 22:10:09.791062", + "step": 242, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:09.854408", + "step": 242, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022397786378860474, + "timestamp": "2025-09-30 22:10:09.858275", + "step": 243, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:09.915704", + "step": 243, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021386751905083656, + "timestamp": "2025-09-30 22:10:09.923545", + "step": 244, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:10.001700", + "step": 244, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02710045874118805, + "timestamp": "2025-09-30 22:10:10.004521", + "step": 245, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:10.076145", + "step": 245, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026470202952623367, + "timestamp": "2025-09-30 22:10:10.083954", + "step": 246, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:10.146486", + "step": 246, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025309713557362556, + "timestamp": "2025-09-30 22:10:10.149331", + "step": 247, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:10.207779", + "step": 247, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023809578269720078, + "timestamp": "2025-09-30 22:10:10.218339", + "step": 248, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:10.277901", + "step": 248, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028678497299551964, + "timestamp": "2025-09-30 22:10:10.286335", + "step": 249, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:10.367509", + "step": 249, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019650932401418686, + "timestamp": "2025-09-30 22:10:10.370450", + "step": 250, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:10.429746", + "step": 250, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025458762422204018, + "timestamp": "2025-09-30 22:10:10.438017", + "step": 251, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:10.495845", + "step": 251, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017796490341424942, + "timestamp": "2025-09-30 22:10:10.505651", + "step": 252, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:10.567499", + "step": 252, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015613814815878868, + "timestamp": "2025-09-30 22:10:10.575269", + "step": 253, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:10.640852", + "step": 253, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026363762095570564, + "timestamp": "2025-09-30 22:10:10.648408", + "step": 254, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:10.708360", + "step": 254, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018938321620225906, + "timestamp": "2025-09-30 22:10:10.719034", + "step": 255, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:10:10.779213", + "step": 255, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019102880731225014, + "timestamp": "2025-09-30 22:10:10.792170", + "step": 256, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:10.853523", + "step": 256, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015843430534005165, + "timestamp": "2025-09-30 22:10:10.861130", + "step": 257, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:10.919829", + "step": 257, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.033805202692747116, + "timestamp": "2025-09-30 22:10:10.930292", + "step": 258, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:10.995113", + "step": 258, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024396615102887154, + "timestamp": "2025-09-30 22:10:10.998228", + "step": 259, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:11.058884", + "step": 259, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020021233707666397, + "timestamp": "2025-09-30 22:10:11.070816", + "step": 260, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:11.140519", + "step": 260, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02062203176319599, + "timestamp": "2025-09-30 22:10:11.146244", + "step": 261, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:11.216345", + "step": 261, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021955931559205055, + "timestamp": "2025-09-30 22:10:11.218795", + "step": 262, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:11.277173", + "step": 262, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021168585866689682, + "timestamp": "2025-09-30 22:10:11.280607", + "step": 263, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:11.337964", + "step": 263, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014190520159900188, + "timestamp": "2025-09-30 22:10:11.344624", + "step": 264, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:11.404989", + "step": 264, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021121378988027573, + "timestamp": "2025-09-30 22:10:11.408576", + "step": 265, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:11.466890", + "step": 265, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019114112481474876, + "timestamp": "2025-09-30 22:10:11.470548", + "step": 266, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:11.535659", + "step": 266, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025738313794136047, + "timestamp": "2025-09-30 22:10:11.539012", + "step": 267, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:11.606169", + "step": 267, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03802191838622093, + "timestamp": "2025-09-30 22:10:11.612777", + "step": 268, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:11.672111", + "step": 268, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018177002668380737, + "timestamp": "2025-09-30 22:10:11.674706", + "step": 269, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:11.747999", + "step": 269, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0222756527364254, + "timestamp": "2025-09-30 22:10:11.752497", + "step": 270, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:11.820640", + "step": 270, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02620212733745575, + "timestamp": "2025-09-30 22:10:11.827843", + "step": 271, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:11.890053", + "step": 271, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019811127334833145, + "timestamp": "2025-09-30 22:10:11.902278", + "step": 272, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:11.969423", + "step": 272, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03354780375957489, + "timestamp": "2025-09-30 22:10:11.977684", + "step": 273, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:12.043117", + "step": 273, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011064850725233555, + "timestamp": "2025-09-30 22:10:12.052548", + "step": 274, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:12.123721", + "step": 274, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02540605701506138, + "timestamp": "2025-09-30 22:10:12.131500", + "step": 275, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:12.199338", + "step": 275, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022669294849038124, + "timestamp": "2025-09-30 22:10:12.206727", + "step": 276, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:12.268274", + "step": 276, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010532691143453121, + "timestamp": "2025-09-30 22:10:12.271962", + "step": 277, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:12.339414", + "step": 277, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03192717209458351, + "timestamp": "2025-09-30 22:10:12.346985", + "step": 278, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:12.407191", + "step": 278, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027782125398516655, + "timestamp": "2025-09-30 22:10:12.413865", + "step": 279, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:12.469832", + "step": 279, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04358547180891037, + "timestamp": "2025-09-30 22:10:12.484214", + "step": 280, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:12.545511", + "step": 280, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018636440858244896, + "timestamp": "2025-09-30 22:10:12.556165", + "step": 281, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:12.620115", + "step": 281, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01732656918466091, + "timestamp": "2025-09-30 22:10:12.623301", + "step": 282, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:12.680350", + "step": 282, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021487489342689514, + "timestamp": "2025-09-30 22:10:12.689464", + "step": 283, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:12.750692", + "step": 283, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.031494829803705215, + "timestamp": "2025-09-30 22:10:12.757457", + "step": 284, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:12.818583", + "step": 284, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02197718620300293, + "timestamp": "2025-09-30 22:10:12.820809", + "step": 285, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:10:14.137228", + "step": 285, + "epoch": 1 + }, + { + "type": "pplx", + "content": 32380318.740329083, + "timestamp": "2025-09-30 22:10:14.140588", + "step": 285, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:14.192593", + "step": 285, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012143162079155445, + "timestamp": "2025-09-30 22:10:14.195045", + "step": 286, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:10:14.249538", + "step": 286, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018916593864560127, + "timestamp": "2025-09-30 22:10:14.252578", + "step": 287, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:14.309892", + "step": 287, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03274720534682274, + "timestamp": "2025-09-30 22:10:14.316172", + "step": 288, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:14.369291", + "step": 288, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02828342653810978, + "timestamp": "2025-09-30 22:10:14.373423", + "step": 289, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:14.429706", + "step": 289, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013313318602740765, + "timestamp": "2025-09-30 22:10:14.434084", + "step": 290, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:14.497217", + "step": 290, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022141721099615097, + "timestamp": "2025-09-30 22:10:14.499249", + "step": 291, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:14.557870", + "step": 291, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026357505470514297, + "timestamp": "2025-09-30 22:10:14.566973", + "step": 292, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:14.621517", + "step": 292, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01601177267730236, + "timestamp": "2025-09-30 22:10:14.631578", + "step": 293, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:14.685483", + "step": 293, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028203275054693222, + "timestamp": "2025-09-30 22:10:14.690289", + "step": 294, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:14.748068", + "step": 294, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019105268642306328, + "timestamp": "2025-09-30 22:10:14.751600", + "step": 295, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:14.806342", + "step": 295, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016064850613474846, + "timestamp": "2025-09-30 22:10:14.814490", + "step": 296, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:14.870963", + "step": 296, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020204851403832436, + "timestamp": "2025-09-30 22:10:14.872966", + "step": 297, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:14.926632", + "step": 297, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026143083348870277, + "timestamp": "2025-09-30 22:10:14.932947", + "step": 298, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:14.991347", + "step": 298, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015985313802957535, + "timestamp": "2025-09-30 22:10:14.997778", + "step": 299, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:15.061141", + "step": 299, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01684342697262764, + "timestamp": "2025-09-30 22:10:15.068483", + "step": 300, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:15.124062", + "step": 300, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02533099241554737, + "timestamp": "2025-09-30 22:10:15.128923", + "step": 301, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:15.184753", + "step": 301, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015226058661937714, + "timestamp": "2025-09-30 22:10:15.191351", + "step": 302, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:15.249276", + "step": 302, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013342132791876793, + "timestamp": "2025-09-30 22:10:15.253023", + "step": 303, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:15.308744", + "step": 303, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01799563132226467, + "timestamp": "2025-09-30 22:10:15.315259", + "step": 304, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:15.371443", + "step": 304, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02207523211836815, + "timestamp": "2025-09-30 22:10:15.374857", + "step": 305, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:15.433040", + "step": 305, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026255745440721512, + "timestamp": "2025-09-30 22:10:15.435725", + "step": 306, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:15.497514", + "step": 306, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02717163972556591, + "timestamp": "2025-09-30 22:10:15.501327", + "step": 307, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:15.556699", + "step": 307, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018035126850008965, + "timestamp": "2025-09-30 22:10:15.564100", + "step": 308, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:15.619353", + "step": 308, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009531312622129917, + "timestamp": "2025-09-30 22:10:15.622592", + "step": 309, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:15.680727", + "step": 309, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030687615275382996, + "timestamp": "2025-09-30 22:10:15.692879", + "step": 310, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:15.752017", + "step": 310, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029424799606204033, + "timestamp": "2025-09-30 22:10:15.754868", + "step": 311, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:15.819561", + "step": 311, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019238030537962914, + "timestamp": "2025-09-30 22:10:15.826004", + "step": 312, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:15.886926", + "step": 312, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024589749053120613, + "timestamp": "2025-09-30 22:10:15.894310", + "step": 313, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:15.957840", + "step": 313, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017089612782001495, + "timestamp": "2025-09-30 22:10:15.961295", + "step": 314, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:16.027620", + "step": 314, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029858587309718132, + "timestamp": "2025-09-30 22:10:16.030633", + "step": 315, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:16.090178", + "step": 315, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008975454606115818, + "timestamp": "2025-09-30 22:10:16.099786", + "step": 316, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:16.157823", + "step": 316, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01512650866061449, + "timestamp": "2025-09-30 22:10:16.160346", + "step": 317, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:16.215635", + "step": 317, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027577145025134087, + "timestamp": "2025-09-30 22:10:16.219071", + "step": 318, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:16.274323", + "step": 318, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017553279176354408, + "timestamp": "2025-09-30 22:10:16.277723", + "step": 319, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:16.336932", + "step": 319, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019269872456789017, + "timestamp": "2025-09-30 22:10:16.343333", + "step": 320, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:16.397626", + "step": 320, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019443640485405922, + "timestamp": "2025-09-30 22:10:16.405909", + "step": 321, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:16.465354", + "step": 321, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03317169472575188, + "timestamp": "2025-09-30 22:10:16.467887", + "step": 322, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:16.528175", + "step": 322, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029915081337094307, + "timestamp": "2025-09-30 22:10:16.532803", + "step": 323, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:16.587604", + "step": 323, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02484137937426567, + "timestamp": "2025-09-30 22:10:16.596854", + "step": 324, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:16.662203", + "step": 324, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022063011303544044, + "timestamp": "2025-09-30 22:10:16.665242", + "step": 325, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:16.720220", + "step": 325, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0251996461302042, + "timestamp": "2025-09-30 22:10:16.723926", + "step": 326, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:16.778169", + "step": 326, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027159083634614944, + "timestamp": "2025-09-30 22:10:16.781078", + "step": 327, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:10:16.837381", + "step": 327, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02359570376574993, + "timestamp": "2025-09-30 22:10:16.843453", + "step": 328, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:16.897805", + "step": 328, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028713062405586243, + "timestamp": "2025-09-30 22:10:16.900001", + "step": 329, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:16.958749", + "step": 329, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028622686862945557, + "timestamp": "2025-09-30 22:10:16.962010", + "step": 330, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:17.019372", + "step": 330, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01968633010983467, + "timestamp": "2025-09-30 22:10:17.022951", + "step": 331, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:17.077163", + "step": 331, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01846577599644661, + "timestamp": "2025-09-30 22:10:17.082746", + "step": 332, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:17.136997", + "step": 332, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026781436055898666, + "timestamp": "2025-09-30 22:10:17.139637", + "step": 333, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:17.194116", + "step": 333, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021621640771627426, + "timestamp": "2025-09-30 22:10:17.196713", + "step": 334, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:17.249465", + "step": 334, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0181210245937109, + "timestamp": "2025-09-30 22:10:17.255248", + "step": 335, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:10:17.318124", + "step": 335, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019643476232886314, + "timestamp": "2025-09-30 22:10:17.324116", + "step": 336, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:17.377700", + "step": 336, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023817529901862144, + "timestamp": "2025-09-30 22:10:17.380338", + "step": 337, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:17.434506", + "step": 337, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017907124012708664, + "timestamp": "2025-09-30 22:10:17.437861", + "step": 338, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:17.490611", + "step": 338, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025304758921265602, + "timestamp": "2025-09-30 22:10:17.494601", + "step": 339, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:17.549116", + "step": 339, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01915588602423668, + "timestamp": "2025-09-30 22:10:17.555595", + "step": 340, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:17.609286", + "step": 340, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019800949841737747, + "timestamp": "2025-09-30 22:10:17.616037", + "step": 341, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:17.671375", + "step": 341, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027113808318972588, + "timestamp": "2025-09-30 22:10:17.677090", + "step": 342, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:10:18.928133", + "step": 342, + "epoch": 1 + }, + { + "type": "pplx", + "content": 33118764.612160176, + "timestamp": "2025-09-30 22:10:18.930856", + "step": 342, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:18.982725", + "step": 342, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021984072402119637, + "timestamp": "2025-09-30 22:10:18.987387", + "step": 343, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:19.044229", + "step": 343, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03714148327708244, + "timestamp": "2025-09-30 22:10:19.050010", + "step": 344, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:19.103528", + "step": 344, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02487654611468315, + "timestamp": "2025-09-30 22:10:19.106751", + "step": 345, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:19.161194", + "step": 345, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027115579694509506, + "timestamp": "2025-09-30 22:10:19.164115", + "step": 346, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:19.218126", + "step": 346, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021236712113022804, + "timestamp": "2025-09-30 22:10:19.220655", + "step": 347, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:19.273985", + "step": 347, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02278432808816433, + "timestamp": "2025-09-30 22:10:19.279822", + "step": 348, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:19.332898", + "step": 348, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018770869821310043, + "timestamp": "2025-09-30 22:10:19.335724", + "step": 349, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:19.390109", + "step": 349, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02387206256389618, + "timestamp": "2025-09-30 22:10:19.393380", + "step": 350, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:19.449148", + "step": 350, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029281629249453545, + "timestamp": "2025-09-30 22:10:19.452340", + "step": 351, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:19.505888", + "step": 351, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023951295763254166, + "timestamp": "2025-09-30 22:10:19.512866", + "step": 352, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:19.572066", + "step": 352, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.033801205456256866, + "timestamp": "2025-09-30 22:10:19.574004", + "step": 353, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:19.628517", + "step": 353, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02057676389813423, + "timestamp": "2025-09-30 22:10:19.630640", + "step": 354, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:19.684004", + "step": 354, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024962907657027245, + "timestamp": "2025-09-30 22:10:19.686449", + "step": 355, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:19.739625", + "step": 355, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015609494410455227, + "timestamp": "2025-09-30 22:10:19.745689", + "step": 356, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:19.801726", + "step": 356, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02138134464621544, + "timestamp": "2025-09-30 22:10:19.804873", + "step": 357, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:19.857927", + "step": 357, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01888282783329487, + "timestamp": "2025-09-30 22:10:19.860392", + "step": 358, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:19.913884", + "step": 358, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01716557890176773, + "timestamp": "2025-09-30 22:10:19.916491", + "step": 359, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:19.970266", + "step": 359, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01593884639441967, + "timestamp": "2025-09-30 22:10:19.976418", + "step": 360, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:20.035102", + "step": 360, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024571670219302177, + "timestamp": "2025-09-30 22:10:20.038125", + "step": 361, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:20.092636", + "step": 361, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02865268848836422, + "timestamp": "2025-09-30 22:10:20.094879", + "step": 362, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:20.149660", + "step": 362, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020879996940493584, + "timestamp": "2025-09-30 22:10:20.152442", + "step": 363, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:20.207160", + "step": 363, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015153266489505768, + "timestamp": "2025-09-30 22:10:20.213071", + "step": 364, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:20.265866", + "step": 364, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026744915172457695, + "timestamp": "2025-09-30 22:10:20.269002", + "step": 365, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:20.323260", + "step": 365, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014960569329559803, + "timestamp": "2025-09-30 22:10:20.326362", + "step": 366, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:10:20.380718", + "step": 366, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019018033519387245, + "timestamp": "2025-09-30 22:10:20.383960", + "step": 367, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:20.438786", + "step": 367, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027187785133719444, + "timestamp": "2025-09-30 22:10:20.444869", + "step": 368, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:20.499318", + "step": 368, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02450934611260891, + "timestamp": "2025-09-30 22:10:20.501389", + "step": 369, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:20.561671", + "step": 369, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02450348250567913, + "timestamp": "2025-09-30 22:10:20.564282", + "step": 370, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:20.618016", + "step": 370, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013314715586602688, + "timestamp": "2025-09-30 22:10:20.620695", + "step": 371, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:20.681024", + "step": 371, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03389272093772888, + "timestamp": "2025-09-30 22:10:20.686886", + "step": 372, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:20.743579", + "step": 372, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023728221654891968, + "timestamp": "2025-09-30 22:10:20.746212", + "step": 373, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:20.805459", + "step": 373, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02945546805858612, + "timestamp": "2025-09-30 22:10:20.808132", + "step": 374, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:20.862539", + "step": 374, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016989445313811302, + "timestamp": "2025-09-30 22:10:20.865660", + "step": 375, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:20.919975", + "step": 375, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016743594780564308, + "timestamp": "2025-09-30 22:10:20.925919", + "step": 376, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:20.983291", + "step": 376, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012260585092008114, + "timestamp": "2025-09-30 22:10:20.985700", + "step": 377, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:21.041935", + "step": 377, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018505526706576347, + "timestamp": "2025-09-30 22:10:21.044221", + "step": 378, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:21.099361", + "step": 378, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02107120119035244, + "timestamp": "2025-09-30 22:10:21.101514", + "step": 379, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:21.159867", + "step": 379, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02227369323372841, + "timestamp": "2025-09-30 22:10:21.165467", + "step": 380, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:21.218242", + "step": 380, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025716906413435936, + "timestamp": "2025-09-30 22:10:21.220708", + "step": 381, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:21.278757", + "step": 381, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019525570794939995, + "timestamp": "2025-09-30 22:10:21.280994", + "step": 382, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:21.342716", + "step": 382, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012794404290616512, + "timestamp": "2025-09-30 22:10:21.345127", + "step": 383, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:21.401362", + "step": 383, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02354384958744049, + "timestamp": "2025-09-30 22:10:21.406613", + "step": 384, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:21.464824", + "step": 384, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010914976708590984, + "timestamp": "2025-09-30 22:10:21.468655", + "step": 385, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:21.522272", + "step": 385, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017349114641547203, + "timestamp": "2025-09-30 22:10:21.524393", + "step": 386, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:21.583430", + "step": 386, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018263814970850945, + "timestamp": "2025-09-30 22:10:21.586006", + "step": 387, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:21.649324", + "step": 387, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019998768344521523, + "timestamp": "2025-09-30 22:10:21.654921", + "step": 388, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:21.715235", + "step": 388, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016233079135417938, + "timestamp": "2025-09-30 22:10:21.717707", + "step": 389, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:21.775916", + "step": 389, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02938160113990307, + "timestamp": "2025-09-30 22:10:21.778133", + "step": 390, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:21.833207", + "step": 390, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03876896947622299, + "timestamp": "2025-09-30 22:10:21.835192", + "step": 391, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:21.891248", + "step": 391, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013861365616321564, + "timestamp": "2025-09-30 22:10:21.897422", + "step": 392, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:21.953983", + "step": 392, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01384772453457117, + "timestamp": "2025-09-30 22:10:21.956099", + "step": 393, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:10:22.011307", + "step": 393, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020624876022338867, + "timestamp": "2025-09-30 22:10:22.013443", + "step": 394, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:22.067347", + "step": 394, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00986799132078886, + "timestamp": "2025-09-30 22:10:22.069504", + "step": 395, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:22.125853", + "step": 395, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023464640602469444, + "timestamp": "2025-09-30 22:10:22.131474", + "step": 396, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:22.190813", + "step": 396, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009329124353826046, + "timestamp": "2025-09-30 22:10:22.193032", + "step": 397, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:22.251437", + "step": 397, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013422129675745964, + "timestamp": "2025-09-30 22:10:22.253601", + "step": 398, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:22.313231", + "step": 398, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023139040917158127, + "timestamp": "2025-09-30 22:10:22.315268", + "step": 399, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:10:23.660296", + "step": 399, + "epoch": 1 + }, + { + "type": "pplx", + "content": 40589601.62274881, + "timestamp": "2025-09-30 22:10:23.662816", + "step": 399, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:23.715616", + "step": 399, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01724778302013874, + "timestamp": "2025-09-30 22:10:23.721952", + "step": 400, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:23.777851", + "step": 400, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008374908939003944, + "timestamp": "2025-09-30 22:10:23.780509", + "step": 401, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:23.835680", + "step": 401, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01957196369767189, + "timestamp": "2025-09-30 22:10:23.838371", + "step": 402, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:23.894866", + "step": 402, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018934715539216995, + "timestamp": "2025-09-30 22:10:23.897082", + "step": 403, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:23.953152", + "step": 403, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03373418375849724, + "timestamp": "2025-09-30 22:10:23.962739", + "step": 404, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:24.018580", + "step": 404, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027509469538927078, + "timestamp": "2025-09-30 22:10:24.022584", + "step": 405, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:24.078777", + "step": 405, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03421385958790779, + "timestamp": "2025-09-30 22:10:24.081776", + "step": 406, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:24.138442", + "step": 406, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011767718009650707, + "timestamp": "2025-09-30 22:10:24.140810", + "step": 407, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:24.197396", + "step": 407, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009523920714855194, + "timestamp": "2025-09-30 22:10:24.203005", + "step": 408, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:24.258085", + "step": 408, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023020844906568527, + "timestamp": "2025-09-30 22:10:24.261501", + "step": 409, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:24.321430", + "step": 409, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020535219460725784, + "timestamp": "2025-09-30 22:10:24.324678", + "step": 410, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:24.384770", + "step": 410, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01701906882226467, + "timestamp": "2025-09-30 22:10:24.391280", + "step": 411, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:24.449399", + "step": 411, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03201786056160927, + "timestamp": "2025-09-30 22:10:24.457737", + "step": 412, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:24.512109", + "step": 412, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011379254050552845, + "timestamp": "2025-09-30 22:10:24.515992", + "step": 413, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:24.584178", + "step": 413, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03204244002699852, + "timestamp": "2025-09-30 22:10:24.589115", + "step": 414, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:24.645428", + "step": 414, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.035000745207071304, + "timestamp": "2025-09-30 22:10:24.648528", + "step": 415, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:24.708676", + "step": 415, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011228829622268677, + "timestamp": "2025-09-30 22:10:24.715697", + "step": 416, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:24.774441", + "step": 416, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015902062878012657, + "timestamp": "2025-09-30 22:10:24.778489", + "step": 417, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:24.837328", + "step": 417, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029384631663560867, + "timestamp": "2025-09-30 22:10:24.839269", + "step": 418, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:24.893599", + "step": 418, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022646890953183174, + "timestamp": "2025-09-30 22:10:24.897402", + "step": 419, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:24.953933", + "step": 419, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015414858236908913, + "timestamp": "2025-09-30 22:10:24.960738", + "step": 420, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:25.018909", + "step": 420, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015387741848826408, + "timestamp": "2025-09-30 22:10:25.021339", + "step": 421, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:25.076031", + "step": 421, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016175920143723488, + "timestamp": "2025-09-30 22:10:25.078361", + "step": 422, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:25.140468", + "step": 422, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020924212411046028, + "timestamp": "2025-09-30 22:10:25.143111", + "step": 423, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:25.198775", + "step": 423, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015957873314619064, + "timestamp": "2025-09-30 22:10:25.204587", + "step": 424, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:25.257981", + "step": 424, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018746186047792435, + "timestamp": "2025-09-30 22:10:25.260168", + "step": 425, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:25.317623", + "step": 425, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028188351541757584, + "timestamp": "2025-09-30 22:10:25.319833", + "step": 426, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:25.374649", + "step": 426, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015842631459236145, + "timestamp": "2025-09-30 22:10:25.377221", + "step": 427, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:25.436214", + "step": 427, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016134170815348625, + "timestamp": "2025-09-30 22:10:25.442167", + "step": 428, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:25.498915", + "step": 428, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.040546927601099014, + "timestamp": "2025-09-30 22:10:25.500908", + "step": 429, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:25.561078", + "step": 429, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016925180330872536, + "timestamp": "2025-09-30 22:10:25.565189", + "step": 430, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:25.620787", + "step": 430, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01041901670396328, + "timestamp": "2025-09-30 22:10:25.627804", + "step": 431, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:25.688574", + "step": 431, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01679927296936512, + "timestamp": "2025-09-30 22:10:25.694911", + "step": 432, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:25.754989", + "step": 432, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01948891021311283, + "timestamp": "2025-09-30 22:10:25.758013", + "step": 433, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:10:25.813575", + "step": 433, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026692839339375496, + "timestamp": "2025-09-30 22:10:25.820015", + "step": 434, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:25.876179", + "step": 434, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021725250408053398, + "timestamp": "2025-09-30 22:10:25.880139", + "step": 435, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:25.934999", + "step": 435, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016403162851929665, + "timestamp": "2025-09-30 22:10:25.953512", + "step": 436, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:26.011910", + "step": 436, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02581772208213806, + "timestamp": "2025-09-30 22:10:26.019119", + "step": 437, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:26.077043", + "step": 437, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023395583033561707, + "timestamp": "2025-09-30 22:10:26.081776", + "step": 438, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:26.138996", + "step": 438, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03953873738646507, + "timestamp": "2025-09-30 22:10:26.142696", + "step": 439, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:26.198275", + "step": 439, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018297594040632248, + "timestamp": "2025-09-30 22:10:26.206228", + "step": 440, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:26.262605", + "step": 440, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030052199959754944, + "timestamp": "2025-09-30 22:10:26.266573", + "step": 441, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:26.326102", + "step": 441, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029010707512497902, + "timestamp": "2025-09-30 22:10:26.329939", + "step": 442, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:26.392124", + "step": 442, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013197534717619419, + "timestamp": "2025-09-30 22:10:26.395865", + "step": 443, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:26.455099", + "step": 443, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03683071583509445, + "timestamp": "2025-09-30 22:10:26.462646", + "step": 444, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:26.518023", + "step": 444, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01791354827582836, + "timestamp": "2025-09-30 22:10:26.522558", + "step": 445, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:26.580204", + "step": 445, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01698167435824871, + "timestamp": "2025-09-30 22:10:26.584622", + "step": 446, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:26.644908", + "step": 446, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012840881943702698, + "timestamp": "2025-09-30 22:10:26.649589", + "step": 447, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:26.706363", + "step": 447, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021327754482626915, + "timestamp": "2025-09-30 22:10:26.712053", + "step": 448, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:26.770017", + "step": 448, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012066961266100407, + "timestamp": "2025-09-30 22:10:26.775318", + "step": 449, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:26.837781", + "step": 449, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028330016881227493, + "timestamp": "2025-09-30 22:10:26.846404", + "step": 450, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:26.909229", + "step": 450, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023282703012228012, + "timestamp": "2025-09-30 22:10:26.912219", + "step": 451, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:26.966278", + "step": 451, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010071339085698128, + "timestamp": "2025-09-30 22:10:26.976341", + "step": 452, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:27.043117", + "step": 452, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01836954988539219, + "timestamp": "2025-09-30 22:10:27.046716", + "step": 453, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:10:27.101355", + "step": 453, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025107625871896744, + "timestamp": "2025-09-30 22:10:27.103861", + "step": 454, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:27.166366", + "step": 454, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02213280089199543, + "timestamp": "2025-09-30 22:10:27.172278", + "step": 455, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:27.230762", + "step": 455, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012315492145717144, + "timestamp": "2025-09-30 22:10:27.239089", + "step": 456, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:10:28.975277", + "step": 456, + "epoch": 1 + }, + { + "type": "pplx", + "content": 43492757.85152588, + "timestamp": "2025-09-30 22:10:28.977567", + "step": 456, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:29.039128", + "step": 456, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02141198329627514, + "timestamp": "2025-09-30 22:10:29.048866", + "step": 457, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:29.110780", + "step": 457, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0336587019264698, + "timestamp": "2025-09-30 22:10:29.115494", + "step": 458, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:29.173645", + "step": 458, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014187236316502094, + "timestamp": "2025-09-30 22:10:29.180707", + "step": 459, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:29.240772", + "step": 459, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026401251554489136, + "timestamp": "2025-09-30 22:10:29.247468", + "step": 460, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:29.310695", + "step": 460, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020034367218613625, + "timestamp": "2025-09-30 22:10:29.317981", + "step": 461, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:29.376268", + "step": 461, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011318235658109188, + "timestamp": "2025-09-30 22:10:29.382341", + "step": 462, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:29.443870", + "step": 462, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009131813421845436, + "timestamp": "2025-09-30 22:10:29.452675", + "step": 463, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:29.513953", + "step": 463, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020363813266158104, + "timestamp": "2025-09-30 22:10:29.520672", + "step": 464, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:29.585071", + "step": 464, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019311266019940376, + "timestamp": "2025-09-30 22:10:29.587763", + "step": 465, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:29.642703", + "step": 465, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01495366357266903, + "timestamp": "2025-09-30 22:10:29.645362", + "step": 466, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:29.709787", + "step": 466, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023592934012413025, + "timestamp": "2025-09-30 22:10:29.715529", + "step": 467, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:29.776645", + "step": 467, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027076885104179382, + "timestamp": "2025-09-30 22:10:29.785682", + "step": 468, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:29.843777", + "step": 468, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02357049658894539, + "timestamp": "2025-09-30 22:10:29.849824", + "step": 469, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:29.915063", + "step": 469, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02975117228925228, + "timestamp": "2025-09-30 22:10:29.917443", + "step": 470, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:29.974919", + "step": 470, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0073095522820949554, + "timestamp": "2025-09-30 22:10:29.981202", + "step": 471, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:30.042258", + "step": 471, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01751861535012722, + "timestamp": "2025-09-30 22:10:30.048005", + "step": 472, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:30.104524", + "step": 472, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017867306247353554, + "timestamp": "2025-09-30 22:10:30.107760", + "step": 473, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:30.163190", + "step": 473, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018450839444994926, + "timestamp": "2025-09-30 22:10:30.165609", + "step": 474, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:30.243448", + "step": 474, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011997430585324764, + "timestamp": "2025-09-30 22:10:30.245751", + "step": 475, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:30.303755", + "step": 475, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012514320202171803, + "timestamp": "2025-09-30 22:10:30.310204", + "step": 476, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:30.369301", + "step": 476, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017777670174837112, + "timestamp": "2025-09-30 22:10:30.371917", + "step": 477, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:30.429023", + "step": 477, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01758970133960247, + "timestamp": "2025-09-30 22:10:30.435678", + "step": 478, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:30.492913", + "step": 478, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01861104555428028, + "timestamp": "2025-09-30 22:10:30.495572", + "step": 479, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:30.558831", + "step": 479, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012014762498438358, + "timestamp": "2025-09-30 22:10:30.565157", + "step": 480, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:30.623237", + "step": 480, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022567814216017723, + "timestamp": "2025-09-30 22:10:30.632032", + "step": 481, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:30.696343", + "step": 481, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02221139892935753, + "timestamp": "2025-09-30 22:10:30.700014", + "step": 482, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:30.755962", + "step": 482, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014111381955444813, + "timestamp": "2025-09-30 22:10:30.767462", + "step": 483, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:30.829136", + "step": 483, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01386257540434599, + "timestamp": "2025-09-30 22:10:30.844391", + "step": 484, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:30.905039", + "step": 484, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019835516810417175, + "timestamp": "2025-09-30 22:10:30.914402", + "step": 485, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:30.979295", + "step": 485, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015690794214606285, + "timestamp": "2025-09-30 22:10:30.984650", + "step": 486, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:31.042843", + "step": 486, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028532350435853004, + "timestamp": "2025-09-30 22:10:31.045845", + "step": 487, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:31.101781", + "step": 487, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024972129613161087, + "timestamp": "2025-09-30 22:10:31.112277", + "step": 488, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:31.171834", + "step": 488, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018831370398402214, + "timestamp": "2025-09-30 22:10:31.174381", + "step": 489, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:31.230777", + "step": 489, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014749753288924694, + "timestamp": "2025-09-30 22:10:31.233694", + "step": 490, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:31.289017", + "step": 490, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013099046424031258, + "timestamp": "2025-09-30 22:10:31.298818", + "step": 491, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:31.370536", + "step": 491, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009160849265754223, + "timestamp": "2025-09-30 22:10:31.377012", + "step": 492, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:31.442153", + "step": 492, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016461042687296867, + "timestamp": "2025-09-30 22:10:31.451114", + "step": 493, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:31.512076", + "step": 493, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.043794337660074234, + "timestamp": "2025-09-30 22:10:31.515396", + "step": 494, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:31.573231", + "step": 494, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022364290431141853, + "timestamp": "2025-09-30 22:10:31.582922", + "step": 495, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:31.640558", + "step": 495, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021926935762166977, + "timestamp": "2025-09-30 22:10:31.647384", + "step": 496, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:31.707818", + "step": 496, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00889673549681902, + "timestamp": "2025-09-30 22:10:31.709679", + "step": 497, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:31.796642", + "step": 497, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01593521051108837, + "timestamp": "2025-09-30 22:10:31.799398", + "step": 498, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:31.855056", + "step": 498, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01839728094637394, + "timestamp": "2025-09-30 22:10:31.872085", + "step": 499, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:31.931769", + "step": 499, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010415417142212391, + "timestamp": "2025-09-30 22:10:31.944103", + "step": 500, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 500", + "timestamp": "2025-09-30 22:10:32.385970", + "step": 500, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:32.457155", + "step": 500, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009094706736505032, + "timestamp": "2025-09-30 22:10:32.460338", + "step": 501, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:32.530121", + "step": 501, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021521523594856262, + "timestamp": "2025-09-30 22:10:32.534049", + "step": 502, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:32.597198", + "step": 502, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01767658442258835, + "timestamp": "2025-09-30 22:10:32.600463", + "step": 503, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:32.662752", + "step": 503, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01816992275416851, + "timestamp": "2025-09-30 22:10:32.670715", + "step": 504, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:32.744440", + "step": 504, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.005588170140981674, + "timestamp": "2025-09-30 22:10:32.747345", + "step": 505, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:32.802870", + "step": 505, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04537493363022804, + "timestamp": "2025-09-30 22:10:32.805853", + "step": 506, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:32.886649", + "step": 506, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028734682127833366, + "timestamp": "2025-09-30 22:10:32.891721", + "step": 507, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:32.961030", + "step": 507, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030345702543854713, + "timestamp": "2025-09-30 22:10:32.973246", + "step": 508, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:33.039773", + "step": 508, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008057617582380772, + "timestamp": "2025-09-30 22:10:33.042933", + "step": 509, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:33.100278", + "step": 509, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013696548528969288, + "timestamp": "2025-09-30 22:10:33.102771", + "step": 510, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:33.162158", + "step": 510, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024826964363455772, + "timestamp": "2025-09-30 22:10:33.166225", + "step": 511, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:33.224101", + "step": 511, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00507523724809289, + "timestamp": "2025-09-30 22:10:33.231387", + "step": 512, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:33.287069", + "step": 512, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019475262612104416, + "timestamp": "2025-09-30 22:10:33.292055", + "step": 513, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:10:34.768428", + "step": 513, + "epoch": 1 + }, + { + "type": "pplx", + "content": 46595227.32665869, + "timestamp": "2025-09-30 22:10:34.775585", + "step": 513, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:34.833990", + "step": 513, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007841977290809155, + "timestamp": "2025-09-30 22:10:34.845351", + "step": 514, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:34.908910", + "step": 514, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02670980803668499, + "timestamp": "2025-09-30 22:10:34.911739", + "step": 515, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:34.971584", + "step": 515, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017184732481837273, + "timestamp": "2025-09-30 22:10:34.984326", + "step": 516, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:35.047781", + "step": 516, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0331471748650074, + "timestamp": "2025-09-30 22:10:35.055702", + "step": 517, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:35.112052", + "step": 517, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026939330622553825, + "timestamp": "2025-09-30 22:10:35.114808", + "step": 518, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:35.171714", + "step": 518, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02031738869845867, + "timestamp": "2025-09-30 22:10:35.181994", + "step": 519, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:35.247523", + "step": 519, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009875715710222721, + "timestamp": "2025-09-30 22:10:35.260487", + "step": 520, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:35.326048", + "step": 520, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022956620901823044, + "timestamp": "2025-09-30 22:10:35.329588", + "step": 521, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:35.405846", + "step": 521, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02863324247300625, + "timestamp": "2025-09-30 22:10:35.415191", + "step": 522, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:35.480290", + "step": 522, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010198934003710747, + "timestamp": "2025-09-30 22:10:35.488332", + "step": 523, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:35.554304", + "step": 523, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006696059834212065, + "timestamp": "2025-09-30 22:10:35.561717", + "step": 524, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:35.625076", + "step": 524, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014501352794468403, + "timestamp": "2025-09-30 22:10:35.629654", + "step": 525, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:35.686952", + "step": 525, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024316150695085526, + "timestamp": "2025-09-30 22:10:35.689892", + "step": 526, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:35.746425", + "step": 526, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014118111692368984, + "timestamp": "2025-09-30 22:10:35.750028", + "step": 527, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:35.817322", + "step": 527, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013456260785460472, + "timestamp": "2025-09-30 22:10:35.829823", + "step": 528, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:35.887982", + "step": 528, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010260426439344883, + "timestamp": "2025-09-30 22:10:35.895282", + "step": 529, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:35.958752", + "step": 529, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01042400486767292, + "timestamp": "2025-09-30 22:10:35.962803", + "step": 530, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:36.028568", + "step": 530, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027442054823040962, + "timestamp": "2025-09-30 22:10:36.031254", + "step": 531, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:36.092057", + "step": 531, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022459326311945915, + "timestamp": "2025-09-30 22:10:36.104947", + "step": 532, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:36.168237", + "step": 532, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02310291863977909, + "timestamp": "2025-09-30 22:10:36.171118", + "step": 533, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:36.227533", + "step": 533, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02492346242070198, + "timestamp": "2025-09-30 22:10:36.233507", + "step": 534, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:36.294215", + "step": 534, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01682530902326107, + "timestamp": "2025-09-30 22:10:36.299767", + "step": 535, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:36.361267", + "step": 535, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007360723335295916, + "timestamp": "2025-09-30 22:10:36.367897", + "step": 536, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:36.423199", + "step": 536, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03802620619535446, + "timestamp": "2025-09-30 22:10:36.426126", + "step": 537, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:36.486427", + "step": 537, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014312833547592163, + "timestamp": "2025-09-30 22:10:36.491959", + "step": 538, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:36.552805", + "step": 538, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04087325558066368, + "timestamp": "2025-09-30 22:10:36.559559", + "step": 539, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:36.628565", + "step": 539, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015957888215780258, + "timestamp": "2025-09-30 22:10:36.638319", + "step": 540, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:36.697802", + "step": 540, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02199072763323784, + "timestamp": "2025-09-30 22:10:36.707124", + "step": 541, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:36.770624", + "step": 541, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018141640350222588, + "timestamp": "2025-09-30 22:10:36.774492", + "step": 542, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:36.840242", + "step": 542, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011592322029173374, + "timestamp": "2025-09-30 22:10:36.843581", + "step": 543, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:36.909312", + "step": 543, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023553509265184402, + "timestamp": "2025-09-30 22:10:36.920894", + "step": 544, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:36.983571", + "step": 544, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025153178721666336, + "timestamp": "2025-09-30 22:10:36.990509", + "step": 545, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:37.055055", + "step": 545, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024465525522828102, + "timestamp": "2025-09-30 22:10:37.061105", + "step": 546, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:37.122633", + "step": 546, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.036670148372650146, + "timestamp": "2025-09-30 22:10:37.129338", + "step": 547, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:37.190024", + "step": 547, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022738128900527954, + "timestamp": "2025-09-30 22:10:37.197170", + "step": 548, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:37.258684", + "step": 548, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03440006449818611, + "timestamp": "2025-09-30 22:10:37.265618", + "step": 549, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:37.323252", + "step": 549, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030233899131417274, + "timestamp": "2025-09-30 22:10:37.326686", + "step": 550, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:37.395601", + "step": 550, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021326003596186638, + "timestamp": "2025-09-30 22:10:37.399142", + "step": 551, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:37.460428", + "step": 551, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03364909067749977, + "timestamp": "2025-09-30 22:10:37.470963", + "step": 552, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:37.532526", + "step": 552, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027066290378570557, + "timestamp": "2025-09-30 22:10:37.538400", + "step": 553, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:37.605612", + "step": 553, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02852269820868969, + "timestamp": "2025-09-30 22:10:37.615557", + "step": 554, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:37.696930", + "step": 554, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009644846431910992, + "timestamp": "2025-09-30 22:10:37.706231", + "step": 555, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:37.782232", + "step": 555, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024163635447621346, + "timestamp": "2025-09-30 22:10:37.791433", + "step": 556, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:37.848943", + "step": 556, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017797913402318954, + "timestamp": "2025-09-30 22:10:37.851761", + "step": 557, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:37.911890", + "step": 557, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015675710514187813, + "timestamp": "2025-09-30 22:10:37.914217", + "step": 558, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:37.983298", + "step": 558, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017431657761335373, + "timestamp": "2025-09-30 22:10:37.987332", + "step": 559, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:38.065562", + "step": 559, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011734207160770893, + "timestamp": "2025-09-30 22:10:38.072800", + "step": 560, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:38.128930", + "step": 560, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016993921250104904, + "timestamp": "2025-09-30 22:10:38.131406", + "step": 561, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:38.190303", + "step": 561, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019728850573301315, + "timestamp": "2025-09-30 22:10:38.193454", + "step": 562, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:38.261817", + "step": 562, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02789020538330078, + "timestamp": "2025-09-30 22:10:38.274576", + "step": 563, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:38.332263", + "step": 563, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030427774414420128, + "timestamp": "2025-09-30 22:10:38.339142", + "step": 564, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:38.396018", + "step": 564, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02213878743350506, + "timestamp": "2025-09-30 22:10:38.409641", + "step": 565, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:38.472316", + "step": 565, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019610082730650902, + "timestamp": "2025-09-30 22:10:38.474674", + "step": 566, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:38.533261", + "step": 566, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01741156354546547, + "timestamp": "2025-09-30 22:10:38.540039", + "step": 567, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:10:38.601991", + "step": 567, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022262096405029297, + "timestamp": "2025-09-30 22:10:38.609836", + "step": 568, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:38.667090", + "step": 568, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024254482239484787, + "timestamp": "2025-09-30 22:10:38.671827", + "step": 569, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:38.733470", + "step": 569, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017775481566786766, + "timestamp": "2025-09-30 22:10:38.736235", + "step": 570, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:10:40.258822", + "step": 570, + "epoch": 1 + }, + { + "type": "pplx", + "content": 47270191.72182987, + "timestamp": "2025-09-30 22:10:40.261088", + "step": 570, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:40.317993", + "step": 570, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012138426303863525, + "timestamp": "2025-09-30 22:10:40.320234", + "step": 571, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:40.376160", + "step": 571, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011941500008106232, + "timestamp": "2025-09-30 22:10:40.382939", + "step": 572, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:40.458446", + "step": 572, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.045111771672964096, + "timestamp": "2025-09-30 22:10:40.468771", + "step": 573, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:40.533176", + "step": 573, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026581604033708572, + "timestamp": "2025-09-30 22:10:40.535322", + "step": 574, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:40.592110", + "step": 574, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017682049423456192, + "timestamp": "2025-09-30 22:10:40.594331", + "step": 575, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:40.650481", + "step": 575, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010170449502766132, + "timestamp": "2025-09-30 22:10:40.657729", + "step": 576, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:40.715792", + "step": 576, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02111336775124073, + "timestamp": "2025-09-30 22:10:40.718679", + "step": 577, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:10:40.782680", + "step": 577, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026711730286478996, + "timestamp": "2025-09-30 22:10:40.787449", + "step": 578, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:40.854518", + "step": 578, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0173348356038332, + "timestamp": "2025-09-30 22:10:40.857175", + "step": 579, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:40.915291", + "step": 579, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015113255940377712, + "timestamp": "2025-09-30 22:10:40.926331", + "step": 580, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:40.990132", + "step": 580, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014600790105760098, + "timestamp": "2025-09-30 22:10:40.997747", + "step": 581, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:41.073398", + "step": 581, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007143207360059023, + "timestamp": "2025-09-30 22:10:41.076344", + "step": 582, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:41.138532", + "step": 582, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020687399432063103, + "timestamp": "2025-09-30 22:10:41.141022", + "step": 583, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:41.201736", + "step": 583, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01433294266462326, + "timestamp": "2025-09-30 22:10:41.213194", + "step": 584, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:41.273627", + "step": 584, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016348710283637047, + "timestamp": "2025-09-30 22:10:41.280786", + "step": 585, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:41.342328", + "step": 585, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0035052443854510784, + "timestamp": "2025-09-30 22:10:41.347937", + "step": 586, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:41.414579", + "step": 586, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.002743300748988986, + "timestamp": "2025-09-30 22:10:41.416873", + "step": 587, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:41.472859", + "step": 587, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.047513365745544434, + "timestamp": "2025-09-30 22:10:41.480545", + "step": 588, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:41.546176", + "step": 588, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.040245767682790756, + "timestamp": "2025-09-30 22:10:41.549512", + "step": 589, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:41.609742", + "step": 589, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02579033002257347, + "timestamp": "2025-09-30 22:10:41.612831", + "step": 590, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-30 22:10:41.696774", + "step": 590, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0405263788998127, + "timestamp": "2025-09-30 22:10:41.700437", + "step": 591, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:41.757185", + "step": 591, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021822741255164146, + "timestamp": "2025-09-30 22:10:41.764096", + "step": 592, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:41.820886", + "step": 592, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027860483154654503, + "timestamp": "2025-09-30 22:10:41.828222", + "step": 593, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:41.885937", + "step": 593, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.004523556213825941, + "timestamp": "2025-09-30 22:10:41.894777", + "step": 594, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:41.958205", + "step": 594, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02394627407193184, + "timestamp": "2025-09-30 22:10:41.962740", + "step": 595, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:42.021860", + "step": 595, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.038400568068027496, + "timestamp": "2025-09-30 22:10:42.029446", + "step": 596, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:42.091560", + "step": 596, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020109396427869797, + "timestamp": "2025-09-30 22:10:42.094174", + "step": 597, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:42.165711", + "step": 597, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01921306923031807, + "timestamp": "2025-09-30 22:10:42.174038", + "step": 598, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:42.237330", + "step": 598, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.035011257976293564, + "timestamp": "2025-09-30 22:10:42.240295", + "step": 599, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:42.298368", + "step": 599, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022402983158826828, + "timestamp": "2025-09-30 22:10:42.306110", + "step": 600, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:42.369255", + "step": 600, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023800494149327278, + "timestamp": "2025-09-30 22:10:42.372114", + "step": 601, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:42.428097", + "step": 601, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01819646917283535, + "timestamp": "2025-09-30 22:10:42.431036", + "step": 602, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:42.490355", + "step": 602, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02224593423306942, + "timestamp": "2025-09-30 22:10:42.497290", + "step": 603, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:42.558386", + "step": 603, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0203517097979784, + "timestamp": "2025-09-30 22:10:42.564695", + "step": 604, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:42.627576", + "step": 604, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019523965194821358, + "timestamp": "2025-09-30 22:10:42.631531", + "step": 605, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:42.688275", + "step": 605, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027153076604008675, + "timestamp": "2025-09-30 22:10:42.690498", + "step": 606, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:42.755120", + "step": 606, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02785666286945343, + "timestamp": "2025-09-30 22:10:42.757460", + "step": 607, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:42.815434", + "step": 607, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021869728341698647, + "timestamp": "2025-09-30 22:10:42.821966", + "step": 608, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:42.884795", + "step": 608, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018342459574341774, + "timestamp": "2025-09-30 22:10:42.888293", + "step": 609, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:42.944945", + "step": 609, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028766410425305367, + "timestamp": "2025-09-30 22:10:42.947387", + "step": 610, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:43.005852", + "step": 610, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018140679225325584, + "timestamp": "2025-09-30 22:10:43.009253", + "step": 611, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:43.067906", + "step": 611, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.045255787670612335, + "timestamp": "2025-09-30 22:10:43.074222", + "step": 612, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:43.128319", + "step": 612, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0265056025236845, + "timestamp": "2025-09-30 22:10:43.132320", + "step": 613, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:43.198083", + "step": 613, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01379645336419344, + "timestamp": "2025-09-30 22:10:43.203353", + "step": 614, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:43.263000", + "step": 614, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027471302077174187, + "timestamp": "2025-09-30 22:10:43.269960", + "step": 615, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:43.333110", + "step": 615, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024327319115400314, + "timestamp": "2025-09-30 22:10:43.346979", + "step": 616, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:43.409256", + "step": 616, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03087471053004265, + "timestamp": "2025-09-30 22:10:43.412560", + "step": 617, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:43.470023", + "step": 617, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01593298651278019, + "timestamp": "2025-09-30 22:10:43.472721", + "step": 618, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:43.535159", + "step": 618, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025733623653650284, + "timestamp": "2025-09-30 22:10:43.542048", + "step": 619, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:43.600543", + "step": 619, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014233306981623173, + "timestamp": "2025-09-30 22:10:43.610569", + "step": 620, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:43.668833", + "step": 620, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016759194433689117, + "timestamp": "2025-09-30 22:10:43.673266", + "step": 621, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:43.732562", + "step": 621, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017879139631986618, + "timestamp": "2025-09-30 22:10:43.734884", + "step": 622, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:43.790516", + "step": 622, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015650030225515366, + "timestamp": "2025-09-30 22:10:43.793825", + "step": 623, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:43.854359", + "step": 623, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03968540579080582, + "timestamp": "2025-09-30 22:10:43.860624", + "step": 624, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:43.915520", + "step": 624, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02014957368373871, + "timestamp": "2025-09-30 22:10:43.919115", + "step": 625, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:43.983591", + "step": 625, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013079083524644375, + "timestamp": "2025-09-30 22:10:43.986020", + "step": 626, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:44.049977", + "step": 626, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008377542719244957, + "timestamp": "2025-09-30 22:10:44.052699", + "step": 627, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:10:45.478375", + "step": 627, + "epoch": 1 + }, + { + "type": "pplx", + "content": 36910385.758393474, + "timestamp": "2025-09-30 22:10:45.481360", + "step": 627, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:45.537084", + "step": 627, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00950475875288248, + "timestamp": "2025-09-30 22:10:45.543599", + "step": 628, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:45.607483", + "step": 628, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04431208595633507, + "timestamp": "2025-09-30 22:10:45.615877", + "step": 629, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:45.695439", + "step": 629, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03406307473778725, + "timestamp": "2025-09-30 22:10:45.699071", + "step": 630, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:45.761839", + "step": 630, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020670806989073753, + "timestamp": "2025-09-30 22:10:45.764255", + "step": 631, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:45.826547", + "step": 631, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01893521286547184, + "timestamp": "2025-09-30 22:10:45.833509", + "step": 632, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:45.888363", + "step": 632, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.004280415363609791, + "timestamp": "2025-09-30 22:10:45.892234", + "step": 633, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:45.959896", + "step": 633, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011709028854966164, + "timestamp": "2025-09-30 22:10:45.962642", + "step": 634, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:46.031711", + "step": 634, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027526134625077248, + "timestamp": "2025-09-30 22:10:46.034267", + "step": 635, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:46.103348", + "step": 635, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00565726263448596, + "timestamp": "2025-09-30 22:10:46.109439", + "step": 636, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:46.168041", + "step": 636, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.003934341017156839, + "timestamp": "2025-09-30 22:10:46.170929", + "step": 637, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:46.255593", + "step": 637, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04277295991778374, + "timestamp": "2025-09-30 22:10:46.258249", + "step": 638, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:46.328094", + "step": 638, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03677859529852867, + "timestamp": "2025-09-30 22:10:46.331919", + "step": 639, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:46.388579", + "step": 639, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.003129987744614482, + "timestamp": "2025-09-30 22:10:46.395424", + "step": 640, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:46.456612", + "step": 640, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006956647150218487, + "timestamp": "2025-09-30 22:10:46.458826", + "step": 641, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:46.522331", + "step": 641, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016979986801743507, + "timestamp": "2025-09-30 22:10:46.524475", + "step": 642, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:46.599315", + "step": 642, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0076353359036147594, + "timestamp": "2025-09-30 22:10:46.602275", + "step": 643, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:46.662323", + "step": 643, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0387401208281517, + "timestamp": "2025-09-30 22:10:46.668774", + "step": 644, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:46.750641", + "step": 644, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011696984991431236, + "timestamp": "2025-09-30 22:10:46.752924", + "step": 645, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:46.809448", + "step": 645, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03578943759202957, + "timestamp": "2025-09-30 22:10:46.812025", + "step": 646, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:46.883057", + "step": 646, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007673552725464106, + "timestamp": "2025-09-30 22:10:46.885598", + "step": 647, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:46.947206", + "step": 647, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0077004688791930676, + "timestamp": "2025-09-30 22:10:46.954082", + "step": 648, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:47.017350", + "step": 648, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009061289019882679, + "timestamp": "2025-09-30 22:10:47.021631", + "step": 649, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:47.077628", + "step": 649, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008708625100553036, + "timestamp": "2025-09-30 22:10:47.080916", + "step": 650, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:47.135851", + "step": 650, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02237722836434841, + "timestamp": "2025-09-30 22:10:47.138706", + "step": 651, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:47.193613", + "step": 651, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015081758610904217, + "timestamp": "2025-09-30 22:10:47.204993", + "step": 652, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:47.261802", + "step": 652, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020020443946123123, + "timestamp": "2025-09-30 22:10:47.264706", + "step": 653, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:47.325696", + "step": 653, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029474765062332153, + "timestamp": "2025-09-30 22:10:47.335406", + "step": 654, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:47.396809", + "step": 654, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0116732781752944, + "timestamp": "2025-09-30 22:10:47.399639", + "step": 655, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:47.458257", + "step": 655, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007577078882604837, + "timestamp": "2025-09-30 22:10:47.464479", + "step": 656, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:47.526732", + "step": 656, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0052547636441886425, + "timestamp": "2025-09-30 22:10:47.529806", + "step": 657, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:47.586913", + "step": 657, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018849587067961693, + "timestamp": "2025-09-30 22:10:47.593218", + "step": 658, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:47.660670", + "step": 658, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009038467891514301, + "timestamp": "2025-09-30 22:10:47.665059", + "step": 659, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:47.724782", + "step": 659, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01595907285809517, + "timestamp": "2025-09-30 22:10:47.732513", + "step": 660, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:47.789691", + "step": 660, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013368090614676476, + "timestamp": "2025-09-30 22:10:47.791930", + "step": 661, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:47.850005", + "step": 661, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006896801292896271, + "timestamp": "2025-09-30 22:10:47.854407", + "step": 662, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:47.916187", + "step": 662, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01652236096560955, + "timestamp": "2025-09-30 22:10:47.918980", + "step": 663, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:47.979786", + "step": 663, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024208780378103256, + "timestamp": "2025-09-30 22:10:47.995948", + "step": 664, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:48.049264", + "step": 664, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01214011013507843, + "timestamp": "2025-09-30 22:10:48.051480", + "step": 665, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:48.105936", + "step": 665, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00862074550241232, + "timestamp": "2025-09-30 22:10:48.109173", + "step": 666, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:48.163901", + "step": 666, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015050886198878288, + "timestamp": "2025-09-30 22:10:48.167571", + "step": 667, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:48.222532", + "step": 667, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013227096758782864, + "timestamp": "2025-09-30 22:10:48.228725", + "step": 668, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:48.283259", + "step": 668, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.038987185806035995, + "timestamp": "2025-09-30 22:10:48.288928", + "step": 669, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:48.351701", + "step": 669, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04726878181099892, + "timestamp": "2025-09-30 22:10:48.354165", + "step": 670, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:48.410143", + "step": 670, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008983231149613857, + "timestamp": "2025-09-30 22:10:48.413801", + "step": 671, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:48.468268", + "step": 671, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018243545666337013, + "timestamp": "2025-09-30 22:10:48.475102", + "step": 672, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:48.531770", + "step": 672, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02222895435988903, + "timestamp": "2025-09-30 22:10:48.536427", + "step": 673, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:48.596231", + "step": 673, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02621746063232422, + "timestamp": "2025-09-30 22:10:48.598374", + "step": 674, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:48.653154", + "step": 674, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04202299192547798, + "timestamp": "2025-09-30 22:10:48.656988", + "step": 675, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:48.716543", + "step": 675, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014642206020653248, + "timestamp": "2025-09-30 22:10:48.724600", + "step": 676, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:48.784864", + "step": 676, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020012324675917625, + "timestamp": "2025-09-30 22:10:48.787118", + "step": 677, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:48.846088", + "step": 677, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00840475969016552, + "timestamp": "2025-09-30 22:10:48.848230", + "step": 678, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:48.902428", + "step": 678, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007752344012260437, + "timestamp": "2025-09-30 22:10:48.909039", + "step": 679, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:48.967559", + "step": 679, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013558789156377316, + "timestamp": "2025-09-30 22:10:48.975134", + "step": 680, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:49.029934", + "step": 680, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010377148166298866, + "timestamp": "2025-09-30 22:10:49.044282", + "step": 681, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:49.099360", + "step": 681, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012168629094958305, + "timestamp": "2025-09-30 22:10:49.101658", + "step": 682, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:49.166872", + "step": 682, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015606013126671314, + "timestamp": "2025-09-30 22:10:49.169829", + "step": 683, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:49.234287", + "step": 683, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009783417917788029, + "timestamp": "2025-09-30 22:10:49.251208", + "step": 684, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:10:50.679333", + "step": 684, + "epoch": 1 + }, + { + "type": "pplx", + "content": 32421580.472615503, + "timestamp": "2025-09-30 22:10:50.682440", + "step": 684, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:50.737433", + "step": 684, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030063265934586525, + "timestamp": "2025-09-30 22:10:50.740440", + "step": 685, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:50.803382", + "step": 685, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01967502571642399, + "timestamp": "2025-09-30 22:10:50.806394", + "step": 686, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:50.862605", + "step": 686, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008366209454834461, + "timestamp": "2025-09-30 22:10:50.865456", + "step": 687, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:50.922320", + "step": 687, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010231166146695614, + "timestamp": "2025-09-30 22:10:50.928995", + "step": 688, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:50.983225", + "step": 688, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027960339561104774, + "timestamp": "2025-09-30 22:10:50.985402", + "step": 689, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:51.048020", + "step": 689, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008330179378390312, + "timestamp": "2025-09-30 22:10:51.052493", + "step": 690, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:51.112208", + "step": 690, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01055544801056385, + "timestamp": "2025-09-30 22:10:51.115301", + "step": 691, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:51.173114", + "step": 691, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010108939372003078, + "timestamp": "2025-09-30 22:10:51.179615", + "step": 692, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:51.235667", + "step": 692, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021281301975250244, + "timestamp": "2025-09-30 22:10:51.239421", + "step": 693, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:51.294223", + "step": 693, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05024373531341553, + "timestamp": "2025-09-30 22:10:51.296468", + "step": 694, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:51.351440", + "step": 694, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.005921120289713144, + "timestamp": "2025-09-30 22:10:51.354235", + "step": 695, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:51.409886", + "step": 695, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02190905436873436, + "timestamp": "2025-09-30 22:10:51.415774", + "step": 696, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:51.473821", + "step": 696, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012462352402508259, + "timestamp": "2025-09-30 22:10:51.476265", + "step": 697, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:51.532028", + "step": 697, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009990805760025978, + "timestamp": "2025-09-30 22:10:51.534677", + "step": 698, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:51.589732", + "step": 698, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023069290444254875, + "timestamp": "2025-09-30 22:10:51.596286", + "step": 699, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:51.650913", + "step": 699, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025465210899710655, + "timestamp": "2025-09-30 22:10:51.657683", + "step": 700, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:51.729574", + "step": 700, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007522939704358578, + "timestamp": "2025-09-30 22:10:51.731746", + "step": 701, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:51.786127", + "step": 701, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023840337991714478, + "timestamp": "2025-09-30 22:10:51.788879", + "step": 702, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:51.843789", + "step": 702, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01778830774128437, + "timestamp": "2025-09-30 22:10:51.860122", + "step": 703, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:51.917114", + "step": 703, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013482254929840565, + "timestamp": "2025-09-30 22:10:51.923966", + "step": 704, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:51.978573", + "step": 704, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018839532509446144, + "timestamp": "2025-09-30 22:10:51.980911", + "step": 705, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:52.038927", + "step": 705, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006275936495512724, + "timestamp": "2025-09-30 22:10:52.041640", + "step": 706, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:52.096729", + "step": 706, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006527402438223362, + "timestamp": "2025-09-30 22:10:52.099087", + "step": 707, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:52.152269", + "step": 707, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016934264451265335, + "timestamp": "2025-09-30 22:10:52.158593", + "step": 708, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:52.217087", + "step": 708, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013991935178637505, + "timestamp": "2025-09-30 22:10:52.220158", + "step": 709, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:52.273558", + "step": 709, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.043482519686222076, + "timestamp": "2025-09-30 22:10:52.277808", + "step": 710, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:10:52.335518", + "step": 710, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02595517970621586, + "timestamp": "2025-09-30 22:10:52.338420", + "step": 711, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:52.398550", + "step": 711, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03393742814660072, + "timestamp": "2025-09-30 22:10:52.404498", + "step": 712, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:52.467147", + "step": 712, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019353583455085754, + "timestamp": "2025-09-30 22:10:52.470659", + "step": 713, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:52.529899", + "step": 713, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01128054317086935, + "timestamp": "2025-09-30 22:10:52.535951", + "step": 714, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:52.590776", + "step": 714, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021564552560448647, + "timestamp": "2025-09-30 22:10:52.595225", + "step": 715, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:52.655656", + "step": 715, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03504403680562973, + "timestamp": "2025-09-30 22:10:52.663444", + "step": 716, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:52.719328", + "step": 716, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0178972315043211, + "timestamp": "2025-09-30 22:10:52.723225", + "step": 717, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:52.784679", + "step": 717, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019097017124295235, + "timestamp": "2025-09-30 22:10:52.791581", + "step": 718, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:52.847779", + "step": 718, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01555024366825819, + "timestamp": "2025-09-30 22:10:52.851233", + "step": 719, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:52.905574", + "step": 719, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03621254116296768, + "timestamp": "2025-09-30 22:10:52.912518", + "step": 720, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:52.965224", + "step": 720, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00840581115335226, + "timestamp": "2025-09-30 22:10:52.968506", + "step": 721, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:53.026689", + "step": 721, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007782516535371542, + "timestamp": "2025-09-30 22:10:53.030738", + "step": 722, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:53.085295", + "step": 722, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012187975458800793, + "timestamp": "2025-09-30 22:10:53.089215", + "step": 723, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:53.144371", + "step": 723, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0311259888112545, + "timestamp": "2025-09-30 22:10:53.150171", + "step": 724, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:53.212857", + "step": 724, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030853962525725365, + "timestamp": "2025-09-30 22:10:53.216394", + "step": 725, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:53.272679", + "step": 725, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02167673408985138, + "timestamp": "2025-09-30 22:10:53.275493", + "step": 726, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:53.332586", + "step": 726, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014911350794136524, + "timestamp": "2025-09-30 22:10:53.338789", + "step": 727, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:53.401157", + "step": 727, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009337635710835457, + "timestamp": "2025-09-30 22:10:53.413064", + "step": 728, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:53.469760", + "step": 728, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007847541943192482, + "timestamp": "2025-09-30 22:10:53.473222", + "step": 729, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:53.536806", + "step": 729, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021578481420874596, + "timestamp": "2025-09-30 22:10:53.540382", + "step": 730, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:53.601369", + "step": 730, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009829229675233364, + "timestamp": "2025-09-30 22:10:53.604142", + "step": 731, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:53.675425", + "step": 731, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022942425683140755, + "timestamp": "2025-09-30 22:10:53.683426", + "step": 732, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:53.740581", + "step": 732, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018000300973653793, + "timestamp": "2025-09-30 22:10:53.743633", + "step": 733, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:53.802944", + "step": 733, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01805448904633522, + "timestamp": "2025-09-30 22:10:53.806262", + "step": 734, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:53.865277", + "step": 734, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01143564097583294, + "timestamp": "2025-09-30 22:10:53.868349", + "step": 735, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:53.929590", + "step": 735, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010203097946941853, + "timestamp": "2025-09-30 22:10:53.935468", + "step": 736, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:54.001180", + "step": 736, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028915002942085266, + "timestamp": "2025-09-30 22:10:54.003750", + "step": 737, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:54.064463", + "step": 737, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01114210207015276, + "timestamp": "2025-09-30 22:10:54.067247", + "step": 738, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:54.121329", + "step": 738, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02133123390376568, + "timestamp": "2025-09-30 22:10:54.124058", + "step": 739, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:54.189872", + "step": 739, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.034789033234119415, + "timestamp": "2025-09-30 22:10:54.196789", + "step": 740, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:54.264175", + "step": 740, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029856473207473755, + "timestamp": "2025-09-30 22:10:54.266805", + "step": 741, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:10:55.578341", + "step": 741, + "epoch": 1 + }, + { + "type": "pplx", + "content": 32540290.98788411, + "timestamp": "2025-09-30 22:10:55.580487", + "step": 741, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:55.634594", + "step": 741, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011041047051548958, + "timestamp": "2025-09-30 22:10:55.637911", + "step": 742, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:55.693181", + "step": 742, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013807429000735283, + "timestamp": "2025-09-30 22:10:55.695078", + "step": 743, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:55.751626", + "step": 743, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03291318565607071, + "timestamp": "2025-09-30 22:10:55.757522", + "step": 744, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:55.812977", + "step": 744, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017772836610674858, + "timestamp": "2025-09-30 22:10:55.817366", + "step": 745, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:55.870542", + "step": 745, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010666130110621452, + "timestamp": "2025-09-30 22:10:55.872940", + "step": 746, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:55.927591", + "step": 746, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008871505968272686, + "timestamp": "2025-09-30 22:10:55.929995", + "step": 747, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:55.983417", + "step": 747, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02815295197069645, + "timestamp": "2025-09-30 22:10:55.989362", + "step": 748, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:56.043054", + "step": 748, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018090158700942993, + "timestamp": "2025-09-30 22:10:56.045234", + "step": 749, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:56.098937", + "step": 749, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012871643528342247, + "timestamp": "2025-09-30 22:10:56.101068", + "step": 750, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:56.155911", + "step": 750, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02255532518029213, + "timestamp": "2025-09-30 22:10:56.167119", + "step": 751, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:56.225904", + "step": 751, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03662119060754776, + "timestamp": "2025-09-30 22:10:56.231510", + "step": 752, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:56.285709", + "step": 752, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01848854124546051, + "timestamp": "2025-09-30 22:10:56.289641", + "step": 753, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:56.347354", + "step": 753, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010717155411839485, + "timestamp": "2025-09-30 22:10:56.355255", + "step": 754, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:56.422808", + "step": 754, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014511531218886375, + "timestamp": "2025-09-30 22:10:56.424913", + "step": 755, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:56.482154", + "step": 755, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026509271934628487, + "timestamp": "2025-09-30 22:10:56.489952", + "step": 756, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:56.549793", + "step": 756, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.004727398511022329, + "timestamp": "2025-09-30 22:10:56.553813", + "step": 757, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:56.613752", + "step": 757, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0046147494576871395, + "timestamp": "2025-09-30 22:10:56.620038", + "step": 758, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:56.678010", + "step": 758, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009979399852454662, + "timestamp": "2025-09-30 22:10:56.680933", + "step": 759, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:56.742069", + "step": 759, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0126791438087821, + "timestamp": "2025-09-30 22:10:56.750293", + "step": 760, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:56.814648", + "step": 760, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.005177278071641922, + "timestamp": "2025-09-30 22:10:56.817288", + "step": 761, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:10:56.881049", + "step": 761, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009649130515754223, + "timestamp": "2025-09-30 22:10:56.886005", + "step": 762, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:56.949170", + "step": 762, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.036273036152124405, + "timestamp": "2025-09-30 22:10:56.957677", + "step": 763, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:57.012209", + "step": 763, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009934181347489357, + "timestamp": "2025-09-30 22:10:57.020192", + "step": 764, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:57.081575", + "step": 764, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009199343621730804, + "timestamp": "2025-09-30 22:10:57.095822", + "step": 765, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:57.156959", + "step": 765, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013879785314202309, + "timestamp": "2025-09-30 22:10:57.159436", + "step": 766, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:57.212583", + "step": 766, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00856445636600256, + "timestamp": "2025-09-30 22:10:57.216284", + "step": 767, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:57.276051", + "step": 767, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04576727747917175, + "timestamp": "2025-09-30 22:10:57.284101", + "step": 768, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:57.339006", + "step": 768, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01549856923520565, + "timestamp": "2025-09-30 22:10:57.347060", + "step": 769, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:57.406865", + "step": 769, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.035134777426719666, + "timestamp": "2025-09-30 22:10:57.425285", + "step": 770, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:57.479947", + "step": 770, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027558207511901855, + "timestamp": "2025-09-30 22:10:57.500451", + "step": 771, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:57.572535", + "step": 771, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008749599568545818, + "timestamp": "2025-09-30 22:10:57.578703", + "step": 772, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:57.632785", + "step": 772, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0587838776409626, + "timestamp": "2025-09-30 22:10:57.634782", + "step": 773, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:57.702651", + "step": 773, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.004769704304635525, + "timestamp": "2025-09-30 22:10:57.704741", + "step": 774, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:57.762540", + "step": 774, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030832087621092796, + "timestamp": "2025-09-30 22:10:57.764624", + "step": 775, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:57.818884", + "step": 775, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02731013298034668, + "timestamp": "2025-09-30 22:10:57.824632", + "step": 776, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:57.877914", + "step": 776, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007288725581020117, + "timestamp": "2025-09-30 22:10:57.880273", + "step": 777, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:57.936165", + "step": 777, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.003071850398555398, + "timestamp": "2025-09-30 22:10:57.938334", + "step": 778, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:57.997184", + "step": 778, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02605491690337658, + "timestamp": "2025-09-30 22:10:57.999595", + "step": 779, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:58.055531", + "step": 779, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006960700731724501, + "timestamp": "2025-09-30 22:10:58.061276", + "step": 780, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:58.121422", + "step": 780, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018633553758263588, + "timestamp": "2025-09-30 22:10:58.124436", + "step": 781, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:58.186438", + "step": 781, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0156040508300066, + "timestamp": "2025-09-30 22:10:58.188684", + "step": 782, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:58.241872", + "step": 782, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024231048300862312, + "timestamp": "2025-09-30 22:10:58.244128", + "step": 783, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:58.298950", + "step": 783, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01897078938782215, + "timestamp": "2025-09-30 22:10:58.304930", + "step": 784, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:58.362591", + "step": 784, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007195747457444668, + "timestamp": "2025-09-30 22:10:58.364851", + "step": 785, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:58.419197", + "step": 785, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.004759868141263723, + "timestamp": "2025-09-30 22:10:58.421346", + "step": 786, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:58.475595", + "step": 786, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006611085496842861, + "timestamp": "2025-09-30 22:10:58.477615", + "step": 787, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:58.538663", + "step": 787, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024320388212800026, + "timestamp": "2025-09-30 22:10:58.544699", + "step": 788, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:58.604428", + "step": 788, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01196072157472372, + "timestamp": "2025-09-30 22:10:58.606592", + "step": 789, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:10:58.664400", + "step": 789, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00526924803853035, + "timestamp": "2025-09-30 22:10:58.666685", + "step": 790, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:58.721426", + "step": 790, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013081463053822517, + "timestamp": "2025-09-30 22:10:58.724083", + "step": 791, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:58.780350", + "step": 791, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021804455667734146, + "timestamp": "2025-09-30 22:10:58.786275", + "step": 792, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:58.839769", + "step": 792, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014370249584317207, + "timestamp": "2025-09-30 22:10:58.843237", + "step": 793, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:58.897409", + "step": 793, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03542889282107353, + "timestamp": "2025-09-30 22:10:58.899546", + "step": 794, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:58.957528", + "step": 794, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014196035452187061, + "timestamp": "2025-09-30 22:10:58.959878", + "step": 795, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:59.013285", + "step": 795, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006897695828229189, + "timestamp": "2025-09-30 22:10:59.019364", + "step": 796, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:10:59.072892", + "step": 796, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009900706820189953, + "timestamp": "2025-09-30 22:10:59.078591", + "step": 797, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:10:59.133963", + "step": 797, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017876118421554565, + "timestamp": "2025-09-30 22:10:59.136126", + "step": 798, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:11:00.461430", + "step": 798, + "epoch": 1 + }, + { + "type": "pplx", + "content": 35240375.419794545, + "timestamp": "2025-09-30 22:11:00.463470", + "step": 798, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:00.520434", + "step": 798, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0068840510211884975, + "timestamp": "2025-09-30 22:11:00.522737", + "step": 799, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:00.575833", + "step": 799, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.032674796879291534, + "timestamp": "2025-09-30 22:11:00.583553", + "step": 800, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:00.636895", + "step": 800, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026194969192147255, + "timestamp": "2025-09-30 22:11:00.640345", + "step": 801, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:00.694348", + "step": 801, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029077323153614998, + "timestamp": "2025-09-30 22:11:00.696603", + "step": 802, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:00.751803", + "step": 802, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021133294329047203, + "timestamp": "2025-09-30 22:11:00.754628", + "step": 803, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:00.810255", + "step": 803, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008854121901094913, + "timestamp": "2025-09-30 22:11:00.816284", + "step": 804, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:00.882520", + "step": 804, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025976279750466347, + "timestamp": "2025-09-30 22:11:00.885485", + "step": 805, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:00.940450", + "step": 805, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02907080017030239, + "timestamp": "2025-09-30 22:11:00.942441", + "step": 806, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:00.995813", + "step": 806, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009363925084471703, + "timestamp": "2025-09-30 22:11:00.998106", + "step": 807, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:01.051881", + "step": 807, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017092768102884293, + "timestamp": "2025-09-30 22:11:01.057512", + "step": 808, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:01.111311", + "step": 808, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00928075797855854, + "timestamp": "2025-09-30 22:11:01.113383", + "step": 809, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:01.168876", + "step": 809, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015542738139629364, + "timestamp": "2025-09-30 22:11:01.171649", + "step": 810, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:01.227333", + "step": 810, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022915521636605263, + "timestamp": "2025-09-30 22:11:01.229426", + "step": 811, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:01.282472", + "step": 811, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012876084074378014, + "timestamp": "2025-09-30 22:11:01.288399", + "step": 812, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:01.342096", + "step": 812, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0221566129475832, + "timestamp": "2025-09-30 22:11:01.344402", + "step": 813, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:01.397809", + "step": 813, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017164286226034164, + "timestamp": "2025-09-30 22:11:01.399891", + "step": 814, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:01.453987", + "step": 814, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009646688587963581, + "timestamp": "2025-09-30 22:11:01.457594", + "step": 815, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:01.513635", + "step": 815, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015178002417087555, + "timestamp": "2025-09-30 22:11:01.522945", + "step": 816, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:01.577810", + "step": 816, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030943872407078743, + "timestamp": "2025-09-30 22:11:01.580407", + "step": 817, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:01.635707", + "step": 817, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016377057880163193, + "timestamp": "2025-09-30 22:11:01.638511", + "step": 818, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:01.692527", + "step": 818, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007752551231533289, + "timestamp": "2025-09-30 22:11:01.694955", + "step": 819, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:01.748220", + "step": 819, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011008880101144314, + "timestamp": "2025-09-30 22:11:01.754611", + "step": 820, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:01.808426", + "step": 820, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011034859344363213, + "timestamp": "2025-09-30 22:11:01.813052", + "step": 821, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:01.869500", + "step": 821, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01996428705751896, + "timestamp": "2025-09-30 22:11:01.873235", + "step": 822, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:01.935142", + "step": 822, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017574485391378403, + "timestamp": "2025-09-30 22:11:01.944753", + "step": 823, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:02.005799", + "step": 823, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024993544444441795, + "timestamp": "2025-09-30 22:11:02.012099", + "step": 824, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:02.066524", + "step": 824, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010080697014927864, + "timestamp": "2025-09-30 22:11:02.072252", + "step": 825, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:02.126597", + "step": 825, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015512553974986076, + "timestamp": "2025-09-30 22:11:02.129066", + "step": 826, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:02.186535", + "step": 826, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028046919032931328, + "timestamp": "2025-09-30 22:11:02.192835", + "step": 827, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:02.258556", + "step": 827, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01140694972127676, + "timestamp": "2025-09-30 22:11:02.264406", + "step": 828, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:02.325633", + "step": 828, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01551117654889822, + "timestamp": "2025-09-30 22:11:02.330248", + "step": 829, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:02.383542", + "step": 829, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.005076105706393719, + "timestamp": "2025-09-30 22:11:02.390134", + "step": 830, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:02.447396", + "step": 830, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007691748905926943, + "timestamp": "2025-09-30 22:11:02.464235", + "step": 831, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:02.521544", + "step": 831, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008671620860695839, + "timestamp": "2025-09-30 22:11:02.535575", + "step": 832, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:02.590330", + "step": 832, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.031672995537519455, + "timestamp": "2025-09-30 22:11:02.594947", + "step": 833, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:02.650326", + "step": 833, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01112611498683691, + "timestamp": "2025-09-30 22:11:02.662307", + "step": 834, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:02.727931", + "step": 834, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0036083075683563948, + "timestamp": "2025-09-30 22:11:02.731299", + "step": 835, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:02.789283", + "step": 835, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008130187168717384, + "timestamp": "2025-09-30 22:11:02.799273", + "step": 836, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:02.857425", + "step": 836, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.036892350763082504, + "timestamp": "2025-09-30 22:11:02.860090", + "step": 837, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:02.922647", + "step": 837, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018781444057822227, + "timestamp": "2025-09-30 22:11:02.931661", + "step": 838, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:02.986358", + "step": 838, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029954804107546806, + "timestamp": "2025-09-30 22:11:02.988486", + "step": 839, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:03.047723", + "step": 839, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01475707720965147, + "timestamp": "2025-09-30 22:11:03.054607", + "step": 840, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:03.117898", + "step": 840, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01812085136771202, + "timestamp": "2025-09-30 22:11:03.120429", + "step": 841, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:03.174268", + "step": 841, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030853111296892166, + "timestamp": "2025-09-30 22:11:03.176617", + "step": 842, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:03.235615", + "step": 842, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014583474956452847, + "timestamp": "2025-09-30 22:11:03.237574", + "step": 843, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:03.290976", + "step": 843, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020900966599583626, + "timestamp": "2025-09-30 22:11:03.301990", + "step": 844, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:03.356268", + "step": 844, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0058554792776703835, + "timestamp": "2025-09-30 22:11:03.358429", + "step": 845, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:03.413993", + "step": 845, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.001921547343954444, + "timestamp": "2025-09-30 22:11:03.416165", + "step": 846, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:03.470304", + "step": 846, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.033496271818876266, + "timestamp": "2025-09-30 22:11:03.472297", + "step": 847, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:03.525170", + "step": 847, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01942761428654194, + "timestamp": "2025-09-30 22:11:03.530403", + "step": 848, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:03.586709", + "step": 848, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.003972503822296858, + "timestamp": "2025-09-30 22:11:03.588508", + "step": 849, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:03.646023", + "step": 849, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00408882787451148, + "timestamp": "2025-09-30 22:11:03.648241", + "step": 850, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:03.706513", + "step": 850, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00527701573446393, + "timestamp": "2025-09-30 22:11:03.708413", + "step": 851, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:03.762487", + "step": 851, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016006560996174812, + "timestamp": "2025-09-30 22:11:03.768331", + "step": 852, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:03.821876", + "step": 852, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006566441617906094, + "timestamp": "2025-09-30 22:11:03.823974", + "step": 853, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:03.879460", + "step": 853, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02836592122912407, + "timestamp": "2025-09-30 22:11:03.881726", + "step": 854, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:03.937448", + "step": 854, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006195141933858395, + "timestamp": "2025-09-30 22:11:03.939682", + "step": 855, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:11:05.130762", + "step": 855, + "epoch": 1 + }, + { + "type": "pplx", + "content": 37193476.96670681, + "timestamp": "2025-09-30 22:11:05.132796", + "step": 855, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:05.183818", + "step": 855, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.005694164428859949, + "timestamp": "2025-09-30 22:11:05.189422", + "step": 856, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:05.241745", + "step": 856, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012527307495474815, + "timestamp": "2025-09-30 22:11:05.248245", + "step": 857, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:05.301596", + "step": 857, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010429566726088524, + "timestamp": "2025-09-30 22:11:05.303733", + "step": 858, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:05.357377", + "step": 858, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01283800695091486, + "timestamp": "2025-09-30 22:11:05.359112", + "step": 859, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:05.412159", + "step": 859, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027922047302126884, + "timestamp": "2025-09-30 22:11:05.417257", + "step": 860, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:05.470079", + "step": 860, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019196191802620888, + "timestamp": "2025-09-30 22:11:05.471898", + "step": 861, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:05.525280", + "step": 861, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006581494119018316, + "timestamp": "2025-09-30 22:11:05.529503", + "step": 862, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:05.585152", + "step": 862, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019029339775443077, + "timestamp": "2025-09-30 22:11:05.589783", + "step": 863, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:05.645567", + "step": 863, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013398184441030025, + "timestamp": "2025-09-30 22:11:05.651621", + "step": 864, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:05.705265", + "step": 864, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02754964306950569, + "timestamp": "2025-09-30 22:11:05.708449", + "step": 865, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:05.778912", + "step": 865, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006636460777372122, + "timestamp": "2025-09-30 22:11:05.785334", + "step": 866, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:05.846711", + "step": 866, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0036353315226733685, + "timestamp": "2025-09-30 22:11:05.849779", + "step": 867, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:05.904919", + "step": 867, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011577283963561058, + "timestamp": "2025-09-30 22:11:05.912851", + "step": 868, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:05.976102", + "step": 868, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016597332432866096, + "timestamp": "2025-09-30 22:11:05.980116", + "step": 869, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:06.039350", + "step": 869, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.044374849647283554, + "timestamp": "2025-09-30 22:11:06.043036", + "step": 870, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:06.101559", + "step": 870, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.005777058191597462, + "timestamp": "2025-09-30 22:11:06.105594", + "step": 871, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:06.171476", + "step": 871, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.000960107718128711, + "timestamp": "2025-09-30 22:11:06.179702", + "step": 872, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:06.249129", + "step": 872, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018397826701402664, + "timestamp": "2025-09-30 22:11:06.254591", + "step": 873, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:06.311920", + "step": 873, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009636670351028442, + "timestamp": "2025-09-30 22:11:06.314872", + "step": 874, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:06.372349", + "step": 874, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02451557293534279, + "timestamp": "2025-09-30 22:11:06.376520", + "step": 875, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:06.451044", + "step": 875, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.036388419568538666, + "timestamp": "2025-09-30 22:11:06.458139", + "step": 876, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:06.511108", + "step": 876, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00405623484402895, + "timestamp": "2025-09-30 22:11:06.514565", + "step": 877, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:06.585352", + "step": 877, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00108911597635597, + "timestamp": "2025-09-30 22:11:06.589256", + "step": 878, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:06.647797", + "step": 878, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.040293216705322266, + "timestamp": "2025-09-30 22:11:06.650671", + "step": 879, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:06.705666", + "step": 879, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.053030043840408325, + "timestamp": "2025-09-30 22:11:06.712051", + "step": 880, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:06.777617", + "step": 880, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029138272628188133, + "timestamp": "2025-09-30 22:11:06.780854", + "step": 881, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:06.835173", + "step": 881, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04902893677353859, + "timestamp": "2025-09-30 22:11:06.841768", + "step": 882, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:06.908278", + "step": 882, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03171558305621147, + "timestamp": "2025-09-30 22:11:06.910742", + "step": 883, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:06.973977", + "step": 883, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0021508908830583096, + "timestamp": "2025-09-30 22:11:06.979781", + "step": 884, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:07.038739", + "step": 884, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.005451333709061146, + "timestamp": "2025-09-30 22:11:07.043018", + "step": 885, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:07.103661", + "step": 885, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021850435063242912, + "timestamp": "2025-09-30 22:11:07.118706", + "step": 886, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:07.182709", + "step": 886, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007379130460321903, + "timestamp": "2025-09-30 22:11:07.196464", + "step": 887, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:07.256055", + "step": 887, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024785201996564865, + "timestamp": "2025-09-30 22:11:07.262829", + "step": 888, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:07.316559", + "step": 888, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018535610288381577, + "timestamp": "2025-09-30 22:11:07.319236", + "step": 889, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:07.373407", + "step": 889, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03353133052587509, + "timestamp": "2025-09-30 22:11:07.379519", + "step": 890, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:07.439250", + "step": 890, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018443452194333076, + "timestamp": "2025-09-30 22:11:07.442180", + "step": 891, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:07.496055", + "step": 891, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018285350874066353, + "timestamp": "2025-09-30 22:11:07.503954", + "step": 892, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:07.557501", + "step": 892, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012829815968871117, + "timestamp": "2025-09-30 22:11:07.567651", + "step": 893, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:07.630050", + "step": 893, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05475155636668205, + "timestamp": "2025-09-30 22:11:07.641354", + "step": 894, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:07.705736", + "step": 894, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021429507061839104, + "timestamp": "2025-09-30 22:11:07.720924", + "step": 895, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:07.786174", + "step": 895, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02315603196620941, + "timestamp": "2025-09-30 22:11:07.796568", + "step": 896, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:07.850106", + "step": 896, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03501684218645096, + "timestamp": "2025-09-30 22:11:07.863817", + "step": 897, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:07.919325", + "step": 897, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013003984466195107, + "timestamp": "2025-09-30 22:11:07.923166", + "step": 898, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:07.980445", + "step": 898, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010267031379044056, + "timestamp": "2025-09-30 22:11:07.984239", + "step": 899, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:08.038645", + "step": 899, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018157100304961205, + "timestamp": "2025-09-30 22:11:08.049287", + "step": 900, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:08.104784", + "step": 900, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029190553352236748, + "timestamp": "2025-09-30 22:11:08.114701", + "step": 901, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:08.172726", + "step": 901, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026753077283501625, + "timestamp": "2025-09-30 22:11:08.179576", + "step": 902, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:08.240865", + "step": 902, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01073368452489376, + "timestamp": "2025-09-30 22:11:08.248813", + "step": 903, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:08.309720", + "step": 903, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03276833891868591, + "timestamp": "2025-09-30 22:11:08.323380", + "step": 904, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:08.380159", + "step": 904, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01617128774523735, + "timestamp": "2025-09-30 22:11:08.395340", + "step": 905, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:08.456218", + "step": 905, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010833910666406155, + "timestamp": "2025-09-30 22:11:08.460300", + "step": 906, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:08.515914", + "step": 906, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027199676260352135, + "timestamp": "2025-09-30 22:11:08.519480", + "step": 907, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:08.576222", + "step": 907, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018916381523013115, + "timestamp": "2025-09-30 22:11:08.583916", + "step": 908, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:08.641627", + "step": 908, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02062804065644741, + "timestamp": "2025-09-30 22:11:08.644997", + "step": 909, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:08.706192", + "step": 909, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0058442666195333, + "timestamp": "2025-09-30 22:11:08.709737", + "step": 910, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:08.766142", + "step": 910, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01607462391257286, + "timestamp": "2025-09-30 22:11:08.778255", + "step": 911, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:08.836438", + "step": 911, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007725600618869066, + "timestamp": "2025-09-30 22:11:08.842851", + "step": 912, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:11:10.078308", + "step": 912, + "epoch": 1 + }, + { + "type": "pplx", + "content": 32940359.600703914, + "timestamp": "2025-09-30 22:11:10.090732", + "step": 912, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:10.153675", + "step": 912, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03332886844873428, + "timestamp": "2025-09-30 22:11:10.157740", + "step": 913, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:10.214129", + "step": 913, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013211602345108986, + "timestamp": "2025-09-30 22:11:10.218989", + "step": 914, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:10.272896", + "step": 914, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009566955268383026, + "timestamp": "2025-09-30 22:11:10.284016", + "step": 915, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:10.348896", + "step": 915, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008288740180432796, + "timestamp": "2025-09-30 22:11:10.356530", + "step": 916, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:10.411350", + "step": 916, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028552187606692314, + "timestamp": "2025-09-30 22:11:10.421464", + "step": 917, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:10.483632", + "step": 917, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04483124613761902, + "timestamp": "2025-09-30 22:11:10.487570", + "step": 918, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:10.541429", + "step": 918, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019391506910324097, + "timestamp": "2025-09-30 22:11:10.557400", + "step": 919, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:10.611653", + "step": 919, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.054323118180036545, + "timestamp": "2025-09-30 22:11:10.629999", + "step": 920, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:10.691250", + "step": 920, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02535811997950077, + "timestamp": "2025-09-30 22:11:10.698154", + "step": 921, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:10.752424", + "step": 921, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04979207366704941, + "timestamp": "2025-09-30 22:11:10.756092", + "step": 922, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:10.811296", + "step": 922, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02441842295229435, + "timestamp": "2025-09-30 22:11:10.814036", + "step": 923, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:10.867564", + "step": 923, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04180010035634041, + "timestamp": "2025-09-30 22:11:10.874041", + "step": 924, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:10.927346", + "step": 924, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01361654233187437, + "timestamp": "2025-09-30 22:11:10.937917", + "step": 925, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:10.993247", + "step": 925, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003717446932569146, + "timestamp": "2025-09-30 22:11:10.997274", + "step": 926, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:11.051890", + "step": 926, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03469868749380112, + "timestamp": "2025-09-30 22:11:11.055014", + "step": 927, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:11.109765", + "step": 927, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022886861115694046, + "timestamp": "2025-09-30 22:11:11.116739", + "step": 928, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:11.176655", + "step": 928, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005226781126111746, + "timestamp": "2025-09-30 22:11:11.179549", + "step": 929, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:11.233548", + "step": 929, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017794528976082802, + "timestamp": "2025-09-30 22:11:11.237401", + "step": 930, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:11.291906", + "step": 930, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.034500379115343094, + "timestamp": "2025-09-30 22:11:11.294980", + "step": 931, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:11.349397", + "step": 931, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.030988583341240883, + "timestamp": "2025-09-30 22:11:11.355998", + "step": 932, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:11.410976", + "step": 932, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017014091834425926, + "timestamp": "2025-09-30 22:11:11.414367", + "step": 933, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:11.468837", + "step": 933, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010941265150904655, + "timestamp": "2025-09-30 22:11:11.479844", + "step": 934, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:11.536379", + "step": 934, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01286663394421339, + "timestamp": "2025-09-30 22:11:11.539828", + "step": 935, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:11.595693", + "step": 935, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015257543884217739, + "timestamp": "2025-09-30 22:11:11.602568", + "step": 936, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:11.656340", + "step": 936, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021351372823119164, + "timestamp": "2025-09-30 22:11:11.659197", + "step": 937, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:11.727179", + "step": 937, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02581597864627838, + "timestamp": "2025-09-30 22:11:11.731138", + "step": 938, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:11.786742", + "step": 938, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026747452095150948, + "timestamp": "2025-09-30 22:11:11.790349", + "step": 939, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:11.845472", + "step": 939, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017408985644578934, + "timestamp": "2025-09-30 22:11:11.851936", + "step": 940, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:11.905233", + "step": 940, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023534327745437622, + "timestamp": "2025-09-30 22:11:11.907889", + "step": 941, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:11.963045", + "step": 941, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018282432109117508, + "timestamp": "2025-09-30 22:11:11.968359", + "step": 942, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:12.025145", + "step": 942, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016469817608594894, + "timestamp": "2025-09-30 22:11:12.029379", + "step": 943, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:12.085252", + "step": 943, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014783737249672413, + "timestamp": "2025-09-30 22:11:12.099785", + "step": 944, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:12.160940", + "step": 944, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024034133180975914, + "timestamp": "2025-09-30 22:11:12.164325", + "step": 945, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:12.217753", + "step": 945, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021934911608695984, + "timestamp": "2025-09-30 22:11:12.220707", + "step": 946, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:12.280206", + "step": 946, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020119894295930862, + "timestamp": "2025-09-30 22:11:12.283544", + "step": 947, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:12.337534", + "step": 947, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023923667147755623, + "timestamp": "2025-09-30 22:11:12.343895", + "step": 948, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:12.398463", + "step": 948, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022407446056604385, + "timestamp": "2025-09-30 22:11:12.402721", + "step": 949, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:12.456921", + "step": 949, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013444255106151104, + "timestamp": "2025-09-30 22:11:12.467226", + "step": 950, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:12.530025", + "step": 950, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.027343014255166054, + "timestamp": "2025-09-30 22:11:12.542380", + "step": 951, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:12.595965", + "step": 951, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01585647463798523, + "timestamp": "2025-09-30 22:11:12.612721", + "step": 952, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:12.666604", + "step": 952, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021314824000000954, + "timestamp": "2025-09-30 22:11:12.669789", + "step": 953, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:12.723426", + "step": 953, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.025206713005900383, + "timestamp": "2025-09-30 22:11:12.738101", + "step": 954, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:12.791302", + "step": 954, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019110916182398796, + "timestamp": "2025-09-30 22:11:12.794999", + "step": 955, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:12.850074", + "step": 955, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017940467223525047, + "timestamp": "2025-09-30 22:11:12.857144", + "step": 956, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:12.912948", + "step": 956, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01818234659731388, + "timestamp": "2025-09-30 22:11:12.916307", + "step": 957, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:12.971346", + "step": 957, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0146263986825943, + "timestamp": "2025-09-30 22:11:12.975228", + "step": 958, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:13.028830", + "step": 958, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01784416474401951, + "timestamp": "2025-09-30 22:11:13.032715", + "step": 959, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:13.087234", + "step": 959, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01846710965037346, + "timestamp": "2025-09-30 22:11:13.100827", + "step": 960, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:13.162630", + "step": 960, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006946962792426348, + "timestamp": "2025-09-30 22:11:13.165285", + "step": 961, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:13.223612", + "step": 961, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013810682110488415, + "timestamp": "2025-09-30 22:11:13.227015", + "step": 962, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:13.281240", + "step": 962, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011714011430740356, + "timestamp": "2025-09-30 22:11:13.284618", + "step": 963, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:13.338804", + "step": 963, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01336191687732935, + "timestamp": "2025-09-30 22:11:13.344647", + "step": 964, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:13.399856", + "step": 964, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0091180969029665, + "timestamp": "2025-09-30 22:11:13.407761", + "step": 965, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:13.462252", + "step": 965, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006548265926539898, + "timestamp": "2025-09-30 22:11:13.472964", + "step": 966, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:13.528676", + "step": 966, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02103179506957531, + "timestamp": "2025-09-30 22:11:13.531427", + "step": 967, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:13.585497", + "step": 967, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010586266405880451, + "timestamp": "2025-09-30 22:11:13.598192", + "step": 968, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:13.651417", + "step": 968, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03246624767780304, + "timestamp": "2025-09-30 22:11:13.654241", + "step": 969, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:11:14.864683", + "step": 969, + "epoch": 2 + }, + { + "type": "pplx", + "content": 28798242.8768259, + "timestamp": "2025-09-30 22:11:14.874517", + "step": 969, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:14.929301", + "step": 969, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02270360477268696, + "timestamp": "2025-09-30 22:11:14.932975", + "step": 970, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:14.989521", + "step": 970, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023568162694573402, + "timestamp": "2025-09-30 22:11:14.993038", + "step": 971, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:15.047908", + "step": 971, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014281951822340488, + "timestamp": "2025-09-30 22:11:15.056516", + "step": 972, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:15.115587", + "step": 972, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02344512939453125, + "timestamp": "2025-09-30 22:11:15.118054", + "step": 973, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:15.171272", + "step": 973, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015097950585186481, + "timestamp": "2025-09-30 22:11:15.180781", + "step": 974, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:15.246952", + "step": 974, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015720214694738388, + "timestamp": "2025-09-30 22:11:15.250015", + "step": 975, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:15.307234", + "step": 975, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017806116491556168, + "timestamp": "2025-09-30 22:11:15.320373", + "step": 976, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:15.379570", + "step": 976, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.028563665226101875, + "timestamp": "2025-09-30 22:11:15.383120", + "step": 977, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:15.437607", + "step": 977, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01088719256222248, + "timestamp": "2025-09-30 22:11:15.449621", + "step": 978, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:15.509702", + "step": 978, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01909536123275757, + "timestamp": "2025-09-30 22:11:15.523252", + "step": 979, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:15.588137", + "step": 979, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0036833793856203556, + "timestamp": "2025-09-30 22:11:15.594879", + "step": 980, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:15.648809", + "step": 980, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0032408982515335083, + "timestamp": "2025-09-30 22:11:15.651580", + "step": 981, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:15.713924", + "step": 981, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0031015234999358654, + "timestamp": "2025-09-30 22:11:15.727931", + "step": 982, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:15.785197", + "step": 982, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03271304816007614, + "timestamp": "2025-09-30 22:11:15.788925", + "step": 983, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:15.844036", + "step": 983, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004379452671855688, + "timestamp": "2025-09-30 22:11:15.851440", + "step": 984, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:15.905298", + "step": 984, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.027334626764059067, + "timestamp": "2025-09-30 22:11:15.909656", + "step": 985, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:15.969612", + "step": 985, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03557107597589493, + "timestamp": "2025-09-30 22:11:15.973229", + "step": 986, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:16.028026", + "step": 986, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02836051769554615, + "timestamp": "2025-09-30 22:11:16.030794", + "step": 987, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:16.086426", + "step": 987, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0021814878564327955, + "timestamp": "2025-09-30 22:11:16.100933", + "step": 988, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:16.162657", + "step": 988, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.045109979808330536, + "timestamp": "2025-09-30 22:11:16.165248", + "step": 989, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:16.221746", + "step": 989, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008987918496131897, + "timestamp": "2025-09-30 22:11:16.231912", + "step": 990, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:16.293739", + "step": 990, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05426352098584175, + "timestamp": "2025-09-30 22:11:16.297126", + "step": 991, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:16.352232", + "step": 991, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023926403373479843, + "timestamp": "2025-09-30 22:11:16.365565", + "step": 992, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:16.430875", + "step": 992, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021941417828202248, + "timestamp": "2025-09-30 22:11:16.439585", + "step": 993, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:16.498898", + "step": 993, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022387471050024033, + "timestamp": "2025-09-30 22:11:16.503554", + "step": 994, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:16.559981", + "step": 994, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.034199006855487823, + "timestamp": "2025-09-30 22:11:16.563770", + "step": 995, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:16.626143", + "step": 995, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03138425573706627, + "timestamp": "2025-09-30 22:11:16.633281", + "step": 996, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:16.699866", + "step": 996, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01781405135989189, + "timestamp": "2025-09-30 22:11:16.711858", + "step": 997, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:16.774422", + "step": 997, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01842852309346199, + "timestamp": "2025-09-30 22:11:16.778088", + "step": 998, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:16.837439", + "step": 998, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015847353264689445, + "timestamp": "2025-09-30 22:11:16.840628", + "step": 999, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:16.905813", + "step": 999, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023063872009515762, + "timestamp": "2025-09-30 22:11:16.922447", + "step": 1000, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1000", + "timestamp": "2025-09-30 22:11:17.342366", + "step": 1000, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:17.406168", + "step": 1000, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01709248684346676, + "timestamp": "2025-09-30 22:11:17.410020", + "step": 1001, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:17.475350", + "step": 1001, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020289698615670204, + "timestamp": "2025-09-30 22:11:17.479027", + "step": 1002, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:17.537016", + "step": 1002, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008975411765277386, + "timestamp": "2025-09-30 22:11:17.540174", + "step": 1003, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:17.612218", + "step": 1003, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008156144060194492, + "timestamp": "2025-09-30 22:11:17.625039", + "step": 1004, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:17.681030", + "step": 1004, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01333966851234436, + "timestamp": "2025-09-30 22:11:17.684917", + "step": 1005, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:17.739488", + "step": 1005, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017511749640107155, + "timestamp": "2025-09-30 22:11:17.749395", + "step": 1006, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:17.812097", + "step": 1006, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015036171302199364, + "timestamp": "2025-09-30 22:11:17.814729", + "step": 1007, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:17.870282", + "step": 1007, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.027377020567655563, + "timestamp": "2025-09-30 22:11:17.884203", + "step": 1008, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:17.939068", + "step": 1008, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.025792699307203293, + "timestamp": "2025-09-30 22:11:17.941916", + "step": 1009, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:17.997774", + "step": 1009, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014380039647221565, + "timestamp": "2025-09-30 22:11:18.000089", + "step": 1010, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:18.061728", + "step": 1010, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010104767978191376, + "timestamp": "2025-09-30 22:11:18.065231", + "step": 1011, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:18.119492", + "step": 1011, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026348626241087914, + "timestamp": "2025-09-30 22:11:18.136369", + "step": 1012, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:18.198516", + "step": 1012, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04897860437631607, + "timestamp": "2025-09-30 22:11:18.202468", + "step": 1013, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:18.256848", + "step": 1013, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022462058812379837, + "timestamp": "2025-09-30 22:11:18.260532", + "step": 1014, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:18.315123", + "step": 1014, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01851494610309601, + "timestamp": "2025-09-30 22:11:18.318588", + "step": 1015, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:18.372184", + "step": 1015, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01477868389338255, + "timestamp": "2025-09-30 22:11:18.380950", + "step": 1016, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:18.435393", + "step": 1016, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024721411988139153, + "timestamp": "2025-09-30 22:11:18.439119", + "step": 1017, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:18.494363", + "step": 1017, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01894466020166874, + "timestamp": "2025-09-30 22:11:18.497219", + "step": 1018, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:18.558038", + "step": 1018, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02108973078429699, + "timestamp": "2025-09-30 22:11:18.562002", + "step": 1019, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:18.626079", + "step": 1019, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014491016045212746, + "timestamp": "2025-09-30 22:11:18.639370", + "step": 1020, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:18.695578", + "step": 1020, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01586849056184292, + "timestamp": "2025-09-30 22:11:18.704763", + "step": 1021, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:18.758234", + "step": 1021, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024214720353484154, + "timestamp": "2025-09-30 22:11:18.761605", + "step": 1022, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:18.815061", + "step": 1022, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01589794084429741, + "timestamp": "2025-09-30 22:11:18.820225", + "step": 1023, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:18.875416", + "step": 1023, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013331546448171139, + "timestamp": "2025-09-30 22:11:18.882243", + "step": 1024, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:18.936663", + "step": 1024, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024225672706961632, + "timestamp": "2025-09-30 22:11:18.940125", + "step": 1025, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:18.996413", + "step": 1025, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.025553515180945396, + "timestamp": "2025-09-30 22:11:19.013028", + "step": 1026, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:11:20.209695", + "step": 1026, + "epoch": 2 + }, + { + "type": "pplx", + "content": 28815333.87535932, + "timestamp": "2025-09-30 22:11:20.215325", + "step": 1026, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:11:20.269869", + "step": 1026, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03137680143117905, + "timestamp": "2025-09-30 22:11:20.275244", + "step": 1027, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:20.330426", + "step": 1027, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012752371840178967, + "timestamp": "2025-09-30 22:11:20.338824", + "step": 1028, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:20.392801", + "step": 1028, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01019949372857809, + "timestamp": "2025-09-30 22:11:20.395750", + "step": 1029, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:20.449777", + "step": 1029, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00877032708376646, + "timestamp": "2025-09-30 22:11:20.454485", + "step": 1030, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:20.510207", + "step": 1030, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011759008280932903, + "timestamp": "2025-09-30 22:11:20.513472", + "step": 1031, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:20.567688", + "step": 1031, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011547918431460857, + "timestamp": "2025-09-30 22:11:20.574407", + "step": 1032, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:20.629320", + "step": 1032, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010614079423248768, + "timestamp": "2025-09-30 22:11:20.631833", + "step": 1033, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:20.693831", + "step": 1033, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021462971344590187, + "timestamp": "2025-09-30 22:11:20.698081", + "step": 1034, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:20.752860", + "step": 1034, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012604753486812115, + "timestamp": "2025-09-30 22:11:20.755848", + "step": 1035, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:20.811454", + "step": 1035, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0071957902982831, + "timestamp": "2025-09-30 22:11:20.818646", + "step": 1036, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:20.871794", + "step": 1036, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007674416061490774, + "timestamp": "2025-09-30 22:11:20.875567", + "step": 1037, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:20.930788", + "step": 1037, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.030532391741871834, + "timestamp": "2025-09-30 22:11:20.935486", + "step": 1038, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:20.997287", + "step": 1038, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020820874720811844, + "timestamp": "2025-09-30 22:11:21.000735", + "step": 1039, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:21.056898", + "step": 1039, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02545594982802868, + "timestamp": "2025-09-30 22:11:21.074256", + "step": 1040, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:21.132305", + "step": 1040, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009167616255581379, + "timestamp": "2025-09-30 22:11:21.135789", + "step": 1041, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:21.193450", + "step": 1041, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009028018452227116, + "timestamp": "2025-09-30 22:11:21.197632", + "step": 1042, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:21.257441", + "step": 1042, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05561920627951622, + "timestamp": "2025-09-30 22:11:21.261241", + "step": 1043, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:21.317907", + "step": 1043, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010370013304054737, + "timestamp": "2025-09-30 22:11:21.325239", + "step": 1044, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:21.389422", + "step": 1044, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00789992231875658, + "timestamp": "2025-09-30 22:11:21.396542", + "step": 1045, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:21.458698", + "step": 1045, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015907449647784233, + "timestamp": "2025-09-30 22:11:21.461609", + "step": 1046, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:21.522637", + "step": 1046, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03674934431910515, + "timestamp": "2025-09-30 22:11:21.525264", + "step": 1047, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:21.583737", + "step": 1047, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01920946128666401, + "timestamp": "2025-09-30 22:11:21.590229", + "step": 1048, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:21.645190", + "step": 1048, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009556346572935581, + "timestamp": "2025-09-30 22:11:21.653972", + "step": 1049, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:21.718981", + "step": 1049, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007382354233413935, + "timestamp": "2025-09-30 22:11:21.723397", + "step": 1050, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:21.787348", + "step": 1050, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013103391043841839, + "timestamp": "2025-09-30 22:11:21.796826", + "step": 1051, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:21.859537", + "step": 1051, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012127166613936424, + "timestamp": "2025-09-30 22:11:21.870499", + "step": 1052, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:21.929892", + "step": 1052, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015366926789283752, + "timestamp": "2025-09-30 22:11:21.938332", + "step": 1053, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:21.999245", + "step": 1053, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014787280932068825, + "timestamp": "2025-09-30 22:11:22.008025", + "step": 1054, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:22.068794", + "step": 1054, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03266071528196335, + "timestamp": "2025-09-30 22:11:22.072613", + "step": 1055, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:22.132559", + "step": 1055, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02072693593800068, + "timestamp": "2025-09-30 22:11:22.144141", + "step": 1056, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:22.204106", + "step": 1056, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011774016544222832, + "timestamp": "2025-09-30 22:11:22.213022", + "step": 1057, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:22.273981", + "step": 1057, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016940701752901077, + "timestamp": "2025-09-30 22:11:22.279224", + "step": 1058, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:22.337089", + "step": 1058, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022047163918614388, + "timestamp": "2025-09-30 22:11:22.347032", + "step": 1059, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:22.420003", + "step": 1059, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011164650321006775, + "timestamp": "2025-09-30 22:11:22.433830", + "step": 1060, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:22.504073", + "step": 1060, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011306509375572205, + "timestamp": "2025-09-30 22:11:22.506988", + "step": 1061, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:22.569632", + "step": 1061, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012297945097088814, + "timestamp": "2025-09-30 22:11:22.578635", + "step": 1062, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:22.642090", + "step": 1062, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005849027074873447, + "timestamp": "2025-09-30 22:11:22.645365", + "step": 1063, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:22.702258", + "step": 1063, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009956971742212772, + "timestamp": "2025-09-30 22:11:22.708813", + "step": 1064, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:22.767276", + "step": 1064, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02906261757016182, + "timestamp": "2025-09-30 22:11:22.770910", + "step": 1065, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:22.831779", + "step": 1065, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.034898433834314346, + "timestamp": "2025-09-30 22:11:22.842828", + "step": 1066, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:22.919431", + "step": 1066, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02677224576473236, + "timestamp": "2025-09-30 22:11:22.922878", + "step": 1067, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:22.981516", + "step": 1067, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011528550647199154, + "timestamp": "2025-09-30 22:11:22.990009", + "step": 1068, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:23.046267", + "step": 1068, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005817048251628876, + "timestamp": "2025-09-30 22:11:23.054922", + "step": 1069, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:23.110207", + "step": 1069, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014719395898282528, + "timestamp": "2025-09-30 22:11:23.113003", + "step": 1070, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:23.170356", + "step": 1070, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02249346859753132, + "timestamp": "2025-09-30 22:11:23.173044", + "step": 1071, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:23.226950", + "step": 1071, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.035828422755002975, + "timestamp": "2025-09-30 22:11:23.233136", + "step": 1072, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:23.286002", + "step": 1072, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010695637203752995, + "timestamp": "2025-09-30 22:11:23.288722", + "step": 1073, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:23.345359", + "step": 1073, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0401848740875721, + "timestamp": "2025-09-30 22:11:23.352471", + "step": 1074, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:23.413270", + "step": 1074, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05746329203248024, + "timestamp": "2025-09-30 22:11:23.416088", + "step": 1075, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:23.479871", + "step": 1075, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04341733828186989, + "timestamp": "2025-09-30 22:11:23.494593", + "step": 1076, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:23.552516", + "step": 1076, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02323761023581028, + "timestamp": "2025-09-30 22:11:23.555673", + "step": 1077, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:23.610535", + "step": 1077, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02646273747086525, + "timestamp": "2025-09-30 22:11:23.617313", + "step": 1078, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:23.679251", + "step": 1078, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017116302624344826, + "timestamp": "2025-09-30 22:11:23.684229", + "step": 1079, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:23.739927", + "step": 1079, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018749186769127846, + "timestamp": "2025-09-30 22:11:23.757600", + "step": 1080, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:23.813780", + "step": 1080, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.025141606107354164, + "timestamp": "2025-09-30 22:11:23.828231", + "step": 1081, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:23.890960", + "step": 1081, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026854228228330612, + "timestamp": "2025-09-30 22:11:23.895100", + "step": 1082, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:23.953254", + "step": 1082, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019390176981687546, + "timestamp": "2025-09-30 22:11:23.956489", + "step": 1083, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:11:25.186778", + "step": 1083, + "epoch": 2 + }, + { + "type": "pplx", + "content": 30432312.81053753, + "timestamp": "2025-09-30 22:11:25.198564", + "step": 1083, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:25.261335", + "step": 1083, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01611759141087532, + "timestamp": "2025-09-30 22:11:25.269310", + "step": 1084, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:25.325196", + "step": 1084, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006585521157830954, + "timestamp": "2025-09-30 22:11:25.329013", + "step": 1085, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:25.389748", + "step": 1085, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01776345819234848, + "timestamp": "2025-09-30 22:11:25.393830", + "step": 1086, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:25.449930", + "step": 1086, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02161712571978569, + "timestamp": "2025-09-30 22:11:25.462318", + "step": 1087, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:25.525802", + "step": 1087, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008740507997572422, + "timestamp": "2025-09-30 22:11:25.533504", + "step": 1088, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:25.591122", + "step": 1088, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03131970018148422, + "timestamp": "2025-09-30 22:11:25.595109", + "step": 1089, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:25.649810", + "step": 1089, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021836699917912483, + "timestamp": "2025-09-30 22:11:25.652709", + "step": 1090, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:25.715906", + "step": 1090, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02508729137480259, + "timestamp": "2025-09-30 22:11:25.727891", + "step": 1091, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:25.791875", + "step": 1091, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013201175257563591, + "timestamp": "2025-09-30 22:11:25.799459", + "step": 1092, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:25.854161", + "step": 1092, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.025001246482133865, + "timestamp": "2025-09-30 22:11:25.860494", + "step": 1093, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:25.918284", + "step": 1093, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018011152744293213, + "timestamp": "2025-09-30 22:11:25.922173", + "step": 1094, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:25.977889", + "step": 1094, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015894226729869843, + "timestamp": "2025-09-30 22:11:25.980986", + "step": 1095, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:26.037531", + "step": 1095, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008476397953927517, + "timestamp": "2025-09-30 22:11:26.044321", + "step": 1096, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:26.097394", + "step": 1096, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017200523987412453, + "timestamp": "2025-09-30 22:11:26.100930", + "step": 1097, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:26.158557", + "step": 1097, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020911503583192825, + "timestamp": "2025-09-30 22:11:26.161645", + "step": 1098, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:26.216231", + "step": 1098, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0157220046967268, + "timestamp": "2025-09-30 22:11:26.228058", + "step": 1099, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:26.283086", + "step": 1099, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020792776718735695, + "timestamp": "2025-09-30 22:11:26.291051", + "step": 1100, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:26.345330", + "step": 1100, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026384010910987854, + "timestamp": "2025-09-30 22:11:26.349217", + "step": 1101, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:26.404260", + "step": 1101, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00842324923723936, + "timestamp": "2025-09-30 22:11:26.408124", + "step": 1102, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:26.468374", + "step": 1102, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018321281298995018, + "timestamp": "2025-09-30 22:11:26.480784", + "step": 1103, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:26.545071", + "step": 1103, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016511019319295883, + "timestamp": "2025-09-30 22:11:26.556678", + "step": 1104, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:26.610703", + "step": 1104, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010460903868079185, + "timestamp": "2025-09-30 22:11:26.622778", + "step": 1105, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:26.687316", + "step": 1105, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00898995902389288, + "timestamp": "2025-09-30 22:11:26.690735", + "step": 1106, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:26.759562", + "step": 1106, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.027517573907971382, + "timestamp": "2025-09-30 22:11:26.764616", + "step": 1107, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:26.821596", + "step": 1107, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021612342447042465, + "timestamp": "2025-09-30 22:11:26.839263", + "step": 1108, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:26.895507", + "step": 1108, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009821252897381783, + "timestamp": "2025-09-30 22:11:26.899450", + "step": 1109, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:26.955573", + "step": 1109, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018084069713950157, + "timestamp": "2025-09-30 22:11:26.958415", + "step": 1110, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:27.012521", + "step": 1110, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014403236098587513, + "timestamp": "2025-09-30 22:11:27.016372", + "step": 1111, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:27.070513", + "step": 1111, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02946326695382595, + "timestamp": "2025-09-30 22:11:27.085522", + "step": 1112, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:27.139790", + "step": 1112, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01968367025256157, + "timestamp": "2025-09-30 22:11:27.143655", + "step": 1113, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:27.200252", + "step": 1113, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00848131999373436, + "timestamp": "2025-09-30 22:11:27.204535", + "step": 1114, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:27.268243", + "step": 1114, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021036352962255478, + "timestamp": "2025-09-30 22:11:27.271299", + "step": 1115, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:27.325982", + "step": 1115, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023403389379382133, + "timestamp": "2025-09-30 22:11:27.332962", + "step": 1116, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:27.396211", + "step": 1116, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023917311802506447, + "timestamp": "2025-09-30 22:11:27.401426", + "step": 1117, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:27.455716", + "step": 1117, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013939259573817253, + "timestamp": "2025-09-30 22:11:27.458932", + "step": 1118, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:27.515666", + "step": 1118, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0158989354968071, + "timestamp": "2025-09-30 22:11:27.519650", + "step": 1119, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:27.575322", + "step": 1119, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02595258131623268, + "timestamp": "2025-09-30 22:11:27.583748", + "step": 1120, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:27.640282", + "step": 1120, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021582497283816338, + "timestamp": "2025-09-30 22:11:27.644484", + "step": 1121, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:27.699420", + "step": 1121, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005752278957515955, + "timestamp": "2025-09-30 22:11:27.704823", + "step": 1122, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:27.765007", + "step": 1122, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010556980967521667, + "timestamp": "2025-09-30 22:11:27.767848", + "step": 1123, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:27.823206", + "step": 1123, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01841617561876774, + "timestamp": "2025-09-30 22:11:27.835200", + "step": 1124, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:27.898988", + "step": 1124, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00747791538015008, + "timestamp": "2025-09-30 22:11:27.903636", + "step": 1125, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:27.958240", + "step": 1125, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023193703964352608, + "timestamp": "2025-09-30 22:11:27.969994", + "step": 1126, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:28.033276", + "step": 1126, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03435632213950157, + "timestamp": "2025-09-30 22:11:28.037037", + "step": 1127, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:28.092858", + "step": 1127, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006761268712580204, + "timestamp": "2025-09-30 22:11:28.109904", + "step": 1128, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:28.175862", + "step": 1128, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.055748604238033295, + "timestamp": "2025-09-30 22:11:28.189431", + "step": 1129, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:28.252544", + "step": 1129, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.035190846771001816, + "timestamp": "2025-09-30 22:11:28.257262", + "step": 1130, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:28.322185", + "step": 1130, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01312336977571249, + "timestamp": "2025-09-30 22:11:28.334537", + "step": 1131, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:28.400136", + "step": 1131, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01045858021825552, + "timestamp": "2025-09-30 22:11:28.409011", + "step": 1132, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:28.471113", + "step": 1132, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.028751153498888016, + "timestamp": "2025-09-30 22:11:28.483658", + "step": 1133, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:28.550451", + "step": 1133, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01273017842322588, + "timestamp": "2025-09-30 22:11:28.563256", + "step": 1134, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:28.619462", + "step": 1134, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0197049081325531, + "timestamp": "2025-09-30 22:11:28.623295", + "step": 1135, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:28.688139", + "step": 1135, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007282760459929705, + "timestamp": "2025-09-30 22:11:28.695520", + "step": 1136, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:28.766569", + "step": 1136, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007011772133409977, + "timestamp": "2025-09-30 22:11:28.778955", + "step": 1137, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:28.834229", + "step": 1137, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01817786693572998, + "timestamp": "2025-09-30 22:11:28.838713", + "step": 1138, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:28.894352", + "step": 1138, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009522279724478722, + "timestamp": "2025-09-30 22:11:28.904447", + "step": 1139, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:28.958831", + "step": 1139, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015000018291175365, + "timestamp": "2025-09-30 22:11:28.976470", + "step": 1140, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:11:30.202755", + "step": 1140, + "epoch": 2 + }, + { + "type": "pplx", + "content": 31761433.22399976, + "timestamp": "2025-09-30 22:11:30.207738", + "step": 1140, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:30.261288", + "step": 1140, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023212244734168053, + "timestamp": "2025-09-30 22:11:30.264791", + "step": 1141, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:30.320013", + "step": 1141, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005753274541348219, + "timestamp": "2025-09-30 22:11:30.324165", + "step": 1142, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:30.383960", + "step": 1142, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016708120703697205, + "timestamp": "2025-09-30 22:11:30.388811", + "step": 1143, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:30.453550", + "step": 1143, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03149080649018288, + "timestamp": "2025-09-30 22:11:30.460546", + "step": 1144, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:30.515308", + "step": 1144, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0065864152275025845, + "timestamp": "2025-09-30 22:11:30.519706", + "step": 1145, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:30.575127", + "step": 1145, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015503957867622375, + "timestamp": "2025-09-30 22:11:30.578397", + "step": 1146, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:30.633640", + "step": 1146, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011548278853297234, + "timestamp": "2025-09-30 22:11:30.639347", + "step": 1147, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:30.695135", + "step": 1147, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014130154624581337, + "timestamp": "2025-09-30 22:11:30.710275", + "step": 1148, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:30.764519", + "step": 1148, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022189956158399582, + "timestamp": "2025-09-30 22:11:30.769789", + "step": 1149, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:30.824056", + "step": 1149, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01809052750468254, + "timestamp": "2025-09-30 22:11:30.828934", + "step": 1150, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:30.884363", + "step": 1150, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0377984382212162, + "timestamp": "2025-09-30 22:11:30.888947", + "step": 1151, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:30.943518", + "step": 1151, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014366830699145794, + "timestamp": "2025-09-30 22:11:30.953335", + "step": 1152, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:31.007608", + "step": 1152, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019895801320672035, + "timestamp": "2025-09-30 22:11:31.017945", + "step": 1153, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:31.079661", + "step": 1153, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01277296245098114, + "timestamp": "2025-09-30 22:11:31.086770", + "step": 1154, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:31.140713", + "step": 1154, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.027852704748511314, + "timestamp": "2025-09-30 22:11:31.146979", + "step": 1155, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:31.200899", + "step": 1155, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01802685298025608, + "timestamp": "2025-09-30 22:11:31.207518", + "step": 1156, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:31.261180", + "step": 1156, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0185939259827137, + "timestamp": "2025-09-30 22:11:31.264827", + "step": 1157, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:31.321946", + "step": 1157, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00952514261007309, + "timestamp": "2025-09-30 22:11:31.325302", + "step": 1158, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:31.380426", + "step": 1158, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014004341326653957, + "timestamp": "2025-09-30 22:11:31.383957", + "step": 1159, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:31.437605", + "step": 1159, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011020747944712639, + "timestamp": "2025-09-30 22:11:31.444549", + "step": 1160, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:31.508814", + "step": 1160, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007963932119309902, + "timestamp": "2025-09-30 22:11:31.512980", + "step": 1161, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:31.568675", + "step": 1161, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.029965534806251526, + "timestamp": "2025-09-30 22:11:31.574911", + "step": 1162, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:31.644846", + "step": 1162, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012620776891708374, + "timestamp": "2025-09-30 22:11:31.658105", + "step": 1163, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:31.715055", + "step": 1163, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017773713916540146, + "timestamp": "2025-09-30 22:11:31.722580", + "step": 1164, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:31.776212", + "step": 1164, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015509136021137238, + "timestamp": "2025-09-30 22:11:31.780116", + "step": 1165, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:31.834854", + "step": 1165, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01890859194099903, + "timestamp": "2025-09-30 22:11:31.837981", + "step": 1166, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:31.904513", + "step": 1166, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010066986083984375, + "timestamp": "2025-09-30 22:11:31.911328", + "step": 1167, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:31.965873", + "step": 1167, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03768039494752884, + "timestamp": "2025-09-30 22:11:31.974127", + "step": 1168, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:32.038694", + "step": 1168, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01412307471036911, + "timestamp": "2025-09-30 22:11:32.042393", + "step": 1169, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:32.097717", + "step": 1169, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005952424369752407, + "timestamp": "2025-09-30 22:11:32.105721", + "step": 1170, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:32.162544", + "step": 1170, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02057734504342079, + "timestamp": "2025-09-30 22:11:32.166457", + "step": 1171, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:32.224240", + "step": 1171, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008706753142178059, + "timestamp": "2025-09-30 22:11:32.231341", + "step": 1172, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:11:32.285741", + "step": 1172, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011364555917680264, + "timestamp": "2025-09-30 22:11:32.289356", + "step": 1173, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:32.355027", + "step": 1173, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008187648840248585, + "timestamp": "2025-09-30 22:11:32.369898", + "step": 1174, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:32.433314", + "step": 1174, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01807296834886074, + "timestamp": "2025-09-30 22:11:32.445908", + "step": 1175, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:32.500680", + "step": 1175, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009041151963174343, + "timestamp": "2025-09-30 22:11:32.517829", + "step": 1176, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:32.575103", + "step": 1176, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022169558331370354, + "timestamp": "2025-09-30 22:11:32.587014", + "step": 1177, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:32.661750", + "step": 1177, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009721535257995129, + "timestamp": "2025-09-30 22:11:32.674137", + "step": 1178, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:32.737930", + "step": 1178, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.029778840020298958, + "timestamp": "2025-09-30 22:11:32.741818", + "step": 1179, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:32.799497", + "step": 1179, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012037808075547218, + "timestamp": "2025-09-30 22:11:32.808244", + "step": 1180, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:32.863512", + "step": 1180, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.028365235775709152, + "timestamp": "2025-09-30 22:11:32.875714", + "step": 1181, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:32.944132", + "step": 1181, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03483065962791443, + "timestamp": "2025-09-30 22:11:32.947386", + "step": 1182, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:33.001792", + "step": 1182, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018574940040707588, + "timestamp": "2025-09-30 22:11:33.005179", + "step": 1183, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:33.062620", + "step": 1183, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005325800273567438, + "timestamp": "2025-09-30 22:11:33.069033", + "step": 1184, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:33.130798", + "step": 1184, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020132439211010933, + "timestamp": "2025-09-30 22:11:33.141806", + "step": 1185, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:33.198157", + "step": 1185, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01631121151149273, + "timestamp": "2025-09-30 22:11:33.210341", + "step": 1186, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:33.273179", + "step": 1186, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004564606584608555, + "timestamp": "2025-09-30 22:11:33.278063", + "step": 1187, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:33.332605", + "step": 1187, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014193967916071415, + "timestamp": "2025-09-30 22:11:33.347094", + "step": 1188, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:33.409336", + "step": 1188, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0075381542555987835, + "timestamp": "2025-09-30 22:11:33.412982", + "step": 1189, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:33.466666", + "step": 1189, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.042488861829042435, + "timestamp": "2025-09-30 22:11:33.471047", + "step": 1190, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:33.534611", + "step": 1190, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007255176547914743, + "timestamp": "2025-09-30 22:11:33.545876", + "step": 1191, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:33.602630", + "step": 1191, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02486496791243553, + "timestamp": "2025-09-30 22:11:33.611199", + "step": 1192, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:33.673991", + "step": 1192, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018168171867728233, + "timestamp": "2025-09-30 22:11:33.679538", + "step": 1193, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:33.735190", + "step": 1193, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0016164934495463967, + "timestamp": "2025-09-30 22:11:33.740699", + "step": 1194, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:33.799993", + "step": 1194, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0472748838365078, + "timestamp": "2025-09-30 22:11:33.813257", + "step": 1195, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:33.875956", + "step": 1195, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018774310126900673, + "timestamp": "2025-09-30 22:11:33.883590", + "step": 1196, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:33.938359", + "step": 1196, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.054423097521066666, + "timestamp": "2025-09-30 22:11:33.949340", + "step": 1197, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:11:35.187868", + "step": 1197, + "epoch": 2 + }, + { + "type": "pplx", + "content": 32067022.826058734, + "timestamp": "2025-09-30 22:11:35.192378", + "step": 1197, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:35.253335", + "step": 1197, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007513918448239565, + "timestamp": "2025-09-30 22:11:35.256670", + "step": 1198, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:35.314883", + "step": 1198, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014755907468497753, + "timestamp": "2025-09-30 22:11:35.318113", + "step": 1199, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:35.378580", + "step": 1199, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021854715421795845, + "timestamp": "2025-09-30 22:11:35.390404", + "step": 1200, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:35.449304", + "step": 1200, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.041134320199489594, + "timestamp": "2025-09-30 22:11:35.452074", + "step": 1201, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:35.516034", + "step": 1201, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005287197418510914, + "timestamp": "2025-09-30 22:11:35.520991", + "step": 1202, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:35.577432", + "step": 1202, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0038279914297163486, + "timestamp": "2025-09-30 22:11:35.581096", + "step": 1203, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:11:35.637823", + "step": 1203, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003830043599009514, + "timestamp": "2025-09-30 22:11:35.646802", + "step": 1204, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:35.700118", + "step": 1204, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.032837968319654465, + "timestamp": "2025-09-30 22:11:35.704811", + "step": 1205, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:35.760611", + "step": 1205, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03423704952001572, + "timestamp": "2025-09-30 22:11:35.763826", + "step": 1206, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:35.823186", + "step": 1206, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008218946866691113, + "timestamp": "2025-09-30 22:11:35.826450", + "step": 1207, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:35.880558", + "step": 1207, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01721678301692009, + "timestamp": "2025-09-30 22:11:35.887864", + "step": 1208, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:35.941792", + "step": 1208, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019656702876091003, + "timestamp": "2025-09-30 22:11:35.944512", + "step": 1209, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:35.998526", + "step": 1209, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.029059452936053276, + "timestamp": "2025-09-30 22:11:36.003965", + "step": 1210, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:36.058525", + "step": 1210, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.030055100098252296, + "timestamp": "2025-09-30 22:11:36.061611", + "step": 1211, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:36.117192", + "step": 1211, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02078065276145935, + "timestamp": "2025-09-30 22:11:36.130008", + "step": 1212, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:36.184580", + "step": 1212, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010412467643618584, + "timestamp": "2025-09-30 22:11:36.193851", + "step": 1213, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:36.250863", + "step": 1213, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014479429461061954, + "timestamp": "2025-09-30 22:11:36.254945", + "step": 1214, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:36.312801", + "step": 1214, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016420889645814896, + "timestamp": "2025-09-30 22:11:36.315643", + "step": 1215, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:36.377191", + "step": 1215, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02330826409161091, + "timestamp": "2025-09-30 22:11:36.383470", + "step": 1216, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:36.437724", + "step": 1216, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018506629392504692, + "timestamp": "2025-09-30 22:11:36.450976", + "step": 1217, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:36.505571", + "step": 1217, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006810951977968216, + "timestamp": "2025-09-30 22:11:36.509502", + "step": 1218, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:36.567290", + "step": 1218, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014696040190756321, + "timestamp": "2025-09-30 22:11:36.574004", + "step": 1219, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:36.631116", + "step": 1219, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012312375009059906, + "timestamp": "2025-09-30 22:11:36.637927", + "step": 1220, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:36.694225", + "step": 1220, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007672054227441549, + "timestamp": "2025-09-30 22:11:36.705527", + "step": 1221, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:36.763351", + "step": 1221, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008700719103217125, + "timestamp": "2025-09-30 22:11:36.768175", + "step": 1222, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:36.821917", + "step": 1222, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009297819808125496, + "timestamp": "2025-09-30 22:11:36.824822", + "step": 1223, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:36.879397", + "step": 1223, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01288828905671835, + "timestamp": "2025-09-30 22:11:36.890240", + "step": 1224, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:36.955515", + "step": 1224, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018189510330557823, + "timestamp": "2025-09-30 22:11:36.958618", + "step": 1225, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:37.012393", + "step": 1225, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003722636727616191, + "timestamp": "2025-09-30 22:11:37.025019", + "step": 1226, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:37.080604", + "step": 1226, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014420069754123688, + "timestamp": "2025-09-30 22:11:37.087616", + "step": 1227, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:37.142487", + "step": 1227, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017777537927031517, + "timestamp": "2025-09-30 22:11:37.148574", + "step": 1228, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:37.202651", + "step": 1228, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005553639028221369, + "timestamp": "2025-09-30 22:11:37.205175", + "step": 1229, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:37.269419", + "step": 1229, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02909952774643898, + "timestamp": "2025-09-30 22:11:37.272973", + "step": 1230, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:37.327626", + "step": 1230, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008751442655920982, + "timestamp": "2025-09-30 22:11:37.331343", + "step": 1231, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:37.386264", + "step": 1231, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022108396515250206, + "timestamp": "2025-09-30 22:11:37.393911", + "step": 1232, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:37.447858", + "step": 1232, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0038794793654233217, + "timestamp": "2025-09-30 22:11:37.454350", + "step": 1233, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:37.517045", + "step": 1233, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009794176556169987, + "timestamp": "2025-09-30 22:11:37.520053", + "step": 1234, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:37.575368", + "step": 1234, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02226036600768566, + "timestamp": "2025-09-30 22:11:37.578470", + "step": 1235, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:37.638983", + "step": 1235, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011220982298254967, + "timestamp": "2025-09-30 22:11:37.645391", + "step": 1236, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:37.705527", + "step": 1236, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010935097001492977, + "timestamp": "2025-09-30 22:11:37.717267", + "step": 1237, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:37.771780", + "step": 1237, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015008168295025826, + "timestamp": "2025-09-30 22:11:37.774709", + "step": 1238, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:37.828421", + "step": 1238, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03053663857281208, + "timestamp": "2025-09-30 22:11:37.831613", + "step": 1239, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:37.887977", + "step": 1239, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02169874683022499, + "timestamp": "2025-09-30 22:11:37.894966", + "step": 1240, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:37.949789", + "step": 1240, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012753850780427456, + "timestamp": "2025-09-30 22:11:37.952719", + "step": 1241, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:38.006491", + "step": 1241, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013386455364525318, + "timestamp": "2025-09-30 22:11:38.009169", + "step": 1242, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:38.063334", + "step": 1242, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.029180806130170822, + "timestamp": "2025-09-30 22:11:38.067437", + "step": 1243, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:38.122343", + "step": 1243, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02196519263088703, + "timestamp": "2025-09-30 22:11:38.135050", + "step": 1244, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:11:38.188177", + "step": 1244, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02870265766978264, + "timestamp": "2025-09-30 22:11:38.190963", + "step": 1245, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:38.245249", + "step": 1245, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023608211427927017, + "timestamp": "2025-09-30 22:11:38.257228", + "step": 1246, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:38.322615", + "step": 1246, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.032148636877536774, + "timestamp": "2025-09-30 22:11:38.326587", + "step": 1247, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:38.380869", + "step": 1247, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014642714522778988, + "timestamp": "2025-09-30 22:11:38.387162", + "step": 1248, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:38.446848", + "step": 1248, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.027036087587475777, + "timestamp": "2025-09-30 22:11:38.450541", + "step": 1249, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:38.505596", + "step": 1249, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02789353020489216, + "timestamp": "2025-09-30 22:11:38.518420", + "step": 1250, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:38.575018", + "step": 1250, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02393009327352047, + "timestamp": "2025-09-30 22:11:38.578596", + "step": 1251, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:38.632056", + "step": 1251, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010679802857339382, + "timestamp": "2025-09-30 22:11:38.638585", + "step": 1252, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:11:38.693945", + "step": 1252, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005615527741611004, + "timestamp": "2025-09-30 22:11:38.697404", + "step": 1253, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:38.753186", + "step": 1253, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008227822370827198, + "timestamp": "2025-09-30 22:11:38.756726", + "step": 1254, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:11:39.978401", + "step": 1254, + "epoch": 2 + }, + { + "type": "pplx", + "content": 30062323.890613366, + "timestamp": "2025-09-30 22:11:39.989728", + "step": 1254, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:40.042732", + "step": 1254, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007192269433289766, + "timestamp": "2025-09-30 22:11:40.046046", + "step": 1255, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:40.100429", + "step": 1255, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026882369071245193, + "timestamp": "2025-09-30 22:11:40.106496", + "step": 1256, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:40.159745", + "step": 1256, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01234396081417799, + "timestamp": "2025-09-30 22:11:40.164224", + "step": 1257, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:40.220058", + "step": 1257, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02563643269240856, + "timestamp": "2025-09-30 22:11:40.223694", + "step": 1258, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:40.279550", + "step": 1258, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008174607530236244, + "timestamp": "2025-09-30 22:11:40.283730", + "step": 1259, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:40.344881", + "step": 1259, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015055462718009949, + "timestamp": "2025-09-30 22:11:40.351884", + "step": 1260, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:40.404793", + "step": 1260, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.044176094233989716, + "timestamp": "2025-09-30 22:11:40.407606", + "step": 1261, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:40.462171", + "step": 1261, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012365161441266537, + "timestamp": "2025-09-30 22:11:40.466037", + "step": 1262, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:40.526018", + "step": 1262, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021339308470487595, + "timestamp": "2025-09-30 22:11:40.539809", + "step": 1263, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:40.594211", + "step": 1263, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013789476826786995, + "timestamp": "2025-09-30 22:11:40.611854", + "step": 1264, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:40.676229", + "step": 1264, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010516809299588203, + "timestamp": "2025-09-30 22:11:40.679039", + "step": 1265, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:40.733124", + "step": 1265, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01804107055068016, + "timestamp": "2025-09-30 22:11:40.736506", + "step": 1266, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:40.798080", + "step": 1266, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016653243452310562, + "timestamp": "2025-09-30 22:11:40.801372", + "step": 1267, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:40.863841", + "step": 1267, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023181870579719543, + "timestamp": "2025-09-30 22:11:40.870269", + "step": 1268, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:40.923609", + "step": 1268, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016313303261995316, + "timestamp": "2025-09-30 22:11:40.926436", + "step": 1269, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:40.981079", + "step": 1269, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018928783014416695, + "timestamp": "2025-09-30 22:11:40.994550", + "step": 1270, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:41.049139", + "step": 1270, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009127925150096416, + "timestamp": "2025-09-30 22:11:41.057956", + "step": 1271, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:41.120907", + "step": 1271, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02112032100558281, + "timestamp": "2025-09-30 22:11:41.128416", + "step": 1272, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:41.182515", + "step": 1272, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009894484654068947, + "timestamp": "2025-09-30 22:11:41.187843", + "step": 1273, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:41.249575", + "step": 1273, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022361796349287033, + "timestamp": "2025-09-30 22:11:41.252800", + "step": 1274, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:41.313068", + "step": 1274, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011399311013519764, + "timestamp": "2025-09-30 22:11:41.317342", + "step": 1275, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:41.371745", + "step": 1275, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009731536731123924, + "timestamp": "2025-09-30 22:11:41.378459", + "step": 1276, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:41.433288", + "step": 1276, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015632973983883858, + "timestamp": "2025-09-30 22:11:41.436646", + "step": 1277, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:41.497498", + "step": 1277, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012282473966479301, + "timestamp": "2025-09-30 22:11:41.501010", + "step": 1278, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:41.554689", + "step": 1278, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03517066687345505, + "timestamp": "2025-09-30 22:11:41.558237", + "step": 1279, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:41.613087", + "step": 1279, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018857363611459732, + "timestamp": "2025-09-30 22:11:41.629174", + "step": 1280, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:41.684409", + "step": 1280, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009674946777522564, + "timestamp": "2025-09-30 22:11:41.698238", + "step": 1281, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:41.757905", + "step": 1281, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01863682270050049, + "timestamp": "2025-09-30 22:11:41.761075", + "step": 1282, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:41.818226", + "step": 1282, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00765694584697485, + "timestamp": "2025-09-30 22:11:41.821824", + "step": 1283, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:11:41.876313", + "step": 1283, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009559721685945988, + "timestamp": "2025-09-30 22:11:41.883478", + "step": 1284, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:41.936511", + "step": 1284, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024069128558039665, + "timestamp": "2025-09-30 22:11:41.939872", + "step": 1285, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:41.993918", + "step": 1285, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006193791516125202, + "timestamp": "2025-09-30 22:11:41.998037", + "step": 1286, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:42.053332", + "step": 1286, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015762005001306534, + "timestamp": "2025-09-30 22:11:42.063465", + "step": 1287, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:42.134164", + "step": 1287, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005604383070021868, + "timestamp": "2025-09-30 22:11:42.143673", + "step": 1288, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:42.197288", + "step": 1288, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03566458821296692, + "timestamp": "2025-09-30 22:11:42.201038", + "step": 1289, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:42.256727", + "step": 1289, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012333127669990063, + "timestamp": "2025-09-30 22:11:42.260609", + "step": 1290, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:42.316139", + "step": 1290, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020386451855301857, + "timestamp": "2025-09-30 22:11:42.321025", + "step": 1291, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:42.375757", + "step": 1291, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013080189004540443, + "timestamp": "2025-09-30 22:11:42.382534", + "step": 1292, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:42.444980", + "step": 1292, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004526190459728241, + "timestamp": "2025-09-30 22:11:42.448558", + "step": 1293, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:42.503079", + "step": 1293, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0055126287043094635, + "timestamp": "2025-09-30 22:11:42.506004", + "step": 1294, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:42.572615", + "step": 1294, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005155415739864111, + "timestamp": "2025-09-30 22:11:42.575746", + "step": 1295, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:42.629780", + "step": 1295, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02876908890902996, + "timestamp": "2025-09-30 22:11:42.637312", + "step": 1296, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:42.692052", + "step": 1296, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019892612472176552, + "timestamp": "2025-09-30 22:11:42.695367", + "step": 1297, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:42.750139", + "step": 1297, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024674100801348686, + "timestamp": "2025-09-30 22:11:42.753989", + "step": 1298, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:42.809884", + "step": 1298, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024183692410588264, + "timestamp": "2025-09-30 22:11:42.822187", + "step": 1299, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:42.880441", + "step": 1299, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0076071759685873985, + "timestamp": "2025-09-30 22:11:42.887514", + "step": 1300, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:42.945521", + "step": 1300, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014696098864078522, + "timestamp": "2025-09-30 22:11:42.956393", + "step": 1301, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:43.020195", + "step": 1301, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009948963299393654, + "timestamp": "2025-09-30 22:11:43.024109", + "step": 1302, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:43.079274", + "step": 1302, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018407296389341354, + "timestamp": "2025-09-30 22:11:43.084424", + "step": 1303, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:43.141234", + "step": 1303, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006092921365052462, + "timestamp": "2025-09-30 22:11:43.148848", + "step": 1304, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:43.213109", + "step": 1304, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011458395048975945, + "timestamp": "2025-09-30 22:11:43.216786", + "step": 1305, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:43.270249", + "step": 1305, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00822626892477274, + "timestamp": "2025-09-30 22:11:43.273048", + "step": 1306, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:43.328334", + "step": 1306, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016102908179163933, + "timestamp": "2025-09-30 22:11:43.332627", + "step": 1307, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:43.388206", + "step": 1307, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013590490445494652, + "timestamp": "2025-09-30 22:11:43.395291", + "step": 1308, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:43.449410", + "step": 1308, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005517884157598019, + "timestamp": "2025-09-30 22:11:43.456366", + "step": 1309, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:43.510445", + "step": 1309, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014826023019850254, + "timestamp": "2025-09-30 22:11:43.513652", + "step": 1310, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:11:43.577385", + "step": 1310, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019576190039515495, + "timestamp": "2025-09-30 22:11:43.580375", + "step": 1311, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:11:44.824236", + "step": 1311, + "epoch": 2 + }, + { + "type": "pplx", + "content": 32609117.843413066, + "timestamp": "2025-09-30 22:11:44.828325", + "step": 1311, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:44.880343", + "step": 1311, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004271751269698143, + "timestamp": "2025-09-30 22:11:44.887291", + "step": 1312, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:44.943672", + "step": 1312, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00479540228843689, + "timestamp": "2025-09-30 22:11:44.947951", + "step": 1313, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:45.004542", + "step": 1313, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0011680542957037687, + "timestamp": "2025-09-30 22:11:45.014421", + "step": 1314, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:45.068628", + "step": 1314, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014515669085085392, + "timestamp": "2025-09-30 22:11:45.071802", + "step": 1315, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:45.125654", + "step": 1315, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008202152326703072, + "timestamp": "2025-09-30 22:11:45.133686", + "step": 1316, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:45.188161", + "step": 1316, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012495539151132107, + "timestamp": "2025-09-30 22:11:45.192099", + "step": 1317, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:45.246718", + "step": 1317, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0031438537407666445, + "timestamp": "2025-09-30 22:11:45.250209", + "step": 1318, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:45.309314", + "step": 1318, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018467547371983528, + "timestamp": "2025-09-30 22:11:45.312481", + "step": 1319, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:45.378622", + "step": 1319, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005382470320910215, + "timestamp": "2025-09-30 22:11:45.389200", + "step": 1320, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:45.449995", + "step": 1320, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04153914004564285, + "timestamp": "2025-09-30 22:11:45.453345", + "step": 1321, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:45.506615", + "step": 1321, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023785650730133057, + "timestamp": "2025-09-30 22:11:45.509560", + "step": 1322, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:45.565186", + "step": 1322, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010426363907754421, + "timestamp": "2025-09-30 22:11:45.569391", + "step": 1323, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:45.623237", + "step": 1323, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007811260875314474, + "timestamp": "2025-09-30 22:11:45.631937", + "step": 1324, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:45.684753", + "step": 1324, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00887636374682188, + "timestamp": "2025-09-30 22:11:45.689437", + "step": 1325, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:45.743278", + "step": 1325, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007312100380659103, + "timestamp": "2025-09-30 22:11:45.746821", + "step": 1326, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:45.801232", + "step": 1326, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003930172882974148, + "timestamp": "2025-09-30 22:11:45.804113", + "step": 1327, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:45.866226", + "step": 1327, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011160810478031635, + "timestamp": "2025-09-30 22:11:45.881512", + "step": 1328, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:45.935904", + "step": 1328, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017904015257954597, + "timestamp": "2025-09-30 22:11:45.942772", + "step": 1329, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:45.997789", + "step": 1329, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00545323733240366, + "timestamp": "2025-09-30 22:11:46.006031", + "step": 1330, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:46.062131", + "step": 1330, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04627959802746773, + "timestamp": "2025-09-30 22:11:46.066083", + "step": 1331, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:46.120957", + "step": 1331, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02013224922120571, + "timestamp": "2025-09-30 22:11:46.127254", + "step": 1332, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:46.180087", + "step": 1332, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01763824000954628, + "timestamp": "2025-09-30 22:11:46.183576", + "step": 1333, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:46.238119", + "step": 1333, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01840882934629917, + "timestamp": "2025-09-30 22:11:46.241142", + "step": 1334, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:46.302685", + "step": 1334, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.029163610190153122, + "timestamp": "2025-09-30 22:11:46.306239", + "step": 1335, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:46.365479", + "step": 1335, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005280309822410345, + "timestamp": "2025-09-30 22:11:46.372879", + "step": 1336, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:46.427591", + "step": 1336, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016217708587646484, + "timestamp": "2025-09-30 22:11:46.430872", + "step": 1337, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:46.485684", + "step": 1337, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016818564385175705, + "timestamp": "2025-09-30 22:11:46.491284", + "step": 1338, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:46.566008", + "step": 1338, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005953342653810978, + "timestamp": "2025-09-30 22:11:46.569427", + "step": 1339, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:46.626744", + "step": 1339, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019312188029289246, + "timestamp": "2025-09-30 22:11:46.633310", + "step": 1340, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:46.694266", + "step": 1340, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014265080913901329, + "timestamp": "2025-09-30 22:11:46.697938", + "step": 1341, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:46.759167", + "step": 1341, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014429310336709023, + "timestamp": "2025-09-30 22:11:46.762088", + "step": 1342, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:46.833476", + "step": 1342, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01076581608504057, + "timestamp": "2025-09-30 22:11:46.843812", + "step": 1343, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:46.909121", + "step": 1343, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019898299127817154, + "timestamp": "2025-09-30 22:11:46.917240", + "step": 1344, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:46.977775", + "step": 1344, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0010426754597574472, + "timestamp": "2025-09-30 22:11:46.981247", + "step": 1345, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:47.039218", + "step": 1345, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022380618378520012, + "timestamp": "2025-09-30 22:11:47.045477", + "step": 1346, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:47.100496", + "step": 1346, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012452373281121254, + "timestamp": "2025-09-30 22:11:47.106816", + "step": 1347, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:47.161611", + "step": 1347, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002341157989576459, + "timestamp": "2025-09-30 22:11:47.169873", + "step": 1348, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:47.224614", + "step": 1348, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002771148458123207, + "timestamp": "2025-09-30 22:11:47.227269", + "step": 1349, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:47.282287", + "step": 1349, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010354334488511086, + "timestamp": "2025-09-30 22:11:47.285453", + "step": 1350, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:11:47.347031", + "step": 1350, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009396574459969997, + "timestamp": "2025-09-30 22:11:47.349952", + "step": 1351, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:47.406731", + "step": 1351, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003213896183297038, + "timestamp": "2025-09-30 22:11:47.413957", + "step": 1352, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:47.469778", + "step": 1352, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004314431454986334, + "timestamp": "2025-09-30 22:11:47.473353", + "step": 1353, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:47.529789", + "step": 1353, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013174169696867466, + "timestamp": "2025-09-30 22:11:47.533184", + "step": 1354, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:47.590717", + "step": 1354, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015209296718239784, + "timestamp": "2025-09-30 22:11:47.594409", + "step": 1355, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:47.650799", + "step": 1355, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.038054872304201126, + "timestamp": "2025-09-30 22:11:47.658833", + "step": 1356, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:47.712960", + "step": 1356, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009656942449510098, + "timestamp": "2025-09-30 22:11:47.716647", + "step": 1357, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:47.777202", + "step": 1357, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01215650700032711, + "timestamp": "2025-09-30 22:11:47.786134", + "step": 1358, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:47.846371", + "step": 1358, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015105058439075947, + "timestamp": "2025-09-30 22:11:47.854309", + "step": 1359, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:47.914321", + "step": 1359, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002900704275816679, + "timestamp": "2025-09-30 22:11:47.926334", + "step": 1360, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:47.980560", + "step": 1360, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020952101796865463, + "timestamp": "2025-09-30 22:11:47.987594", + "step": 1361, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:48.043298", + "step": 1361, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015315545722842216, + "timestamp": "2025-09-30 22:11:48.050262", + "step": 1362, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:48.107218", + "step": 1362, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0064353845082223415, + "timestamp": "2025-09-30 22:11:48.110270", + "step": 1363, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:48.169342", + "step": 1363, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006530344020575285, + "timestamp": "2025-09-30 22:11:48.179583", + "step": 1364, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:48.241263", + "step": 1364, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017590684816241264, + "timestamp": "2025-09-30 22:11:48.249981", + "step": 1365, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:48.310235", + "step": 1365, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007951964624226093, + "timestamp": "2025-09-30 22:11:48.318949", + "step": 1366, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:48.374804", + "step": 1366, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01660231314599514, + "timestamp": "2025-09-30 22:11:48.377100", + "step": 1367, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:48.438207", + "step": 1367, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007098283153027296, + "timestamp": "2025-09-30 22:11:48.445739", + "step": 1368, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:11:49.664039", + "step": 1368, + "epoch": 2 + }, + { + "type": "pplx", + "content": 33795719.590478756, + "timestamp": "2025-09-30 22:11:49.666611", + "step": 1368, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:49.719047", + "step": 1368, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0031422681640833616, + "timestamp": "2025-09-30 22:11:49.721231", + "step": 1369, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:49.775273", + "step": 1369, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007872308604419231, + "timestamp": "2025-09-30 22:11:49.778345", + "step": 1370, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:11:49.832827", + "step": 1370, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011166645213961601, + "timestamp": "2025-09-30 22:11:49.838286", + "step": 1371, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:49.892684", + "step": 1371, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008558930829167366, + "timestamp": "2025-09-30 22:11:49.899692", + "step": 1372, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:49.956402", + "step": 1372, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01813081093132496, + "timestamp": "2025-09-30 22:11:49.960747", + "step": 1373, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:50.015076", + "step": 1373, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014572428539395332, + "timestamp": "2025-09-30 22:11:50.017794", + "step": 1374, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:50.071943", + "step": 1374, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.025154201313853264, + "timestamp": "2025-09-30 22:11:50.074765", + "step": 1375, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:50.134387", + "step": 1375, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006909341551363468, + "timestamp": "2025-09-30 22:11:50.143622", + "step": 1376, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:50.199261", + "step": 1376, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024538232013583183, + "timestamp": "2025-09-30 22:11:50.201749", + "step": 1377, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:50.258024", + "step": 1377, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016891105100512505, + "timestamp": "2025-09-30 22:11:50.261650", + "step": 1378, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:50.316502", + "step": 1378, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02443709410727024, + "timestamp": "2025-09-30 22:11:50.319617", + "step": 1379, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:50.373500", + "step": 1379, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00553086306899786, + "timestamp": "2025-09-30 22:11:50.381388", + "step": 1380, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:50.435108", + "step": 1380, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02132841758430004, + "timestamp": "2025-09-30 22:11:50.442804", + "step": 1381, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:50.500292", + "step": 1381, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00981497298926115, + "timestamp": "2025-09-30 22:11:50.503681", + "step": 1382, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:50.557827", + "step": 1382, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0064003728330135345, + "timestamp": "2025-09-30 22:11:50.561098", + "step": 1383, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:50.615611", + "step": 1383, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0022823542822152376, + "timestamp": "2025-09-30 22:11:50.622947", + "step": 1384, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:50.678363", + "step": 1384, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02859966643154621, + "timestamp": "2025-09-30 22:11:50.681709", + "step": 1385, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:50.735485", + "step": 1385, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017722919583320618, + "timestamp": "2025-09-30 22:11:50.738377", + "step": 1386, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:50.792609", + "step": 1386, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03379681333899498, + "timestamp": "2025-09-30 22:11:50.796363", + "step": 1387, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:50.849961", + "step": 1387, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003414042294025421, + "timestamp": "2025-09-30 22:11:50.859211", + "step": 1388, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:50.914429", + "step": 1388, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014491712674498558, + "timestamp": "2025-09-30 22:11:50.921126", + "step": 1389, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:50.977992", + "step": 1389, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009958495385944843, + "timestamp": "2025-09-30 22:11:50.980673", + "step": 1390, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:51.041085", + "step": 1390, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006238664500415325, + "timestamp": "2025-09-30 22:11:51.047289", + "step": 1391, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:51.105263", + "step": 1391, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014195479452610016, + "timestamp": "2025-09-30 22:11:51.118187", + "step": 1392, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:51.175096", + "step": 1392, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00753815146163106, + "timestamp": "2025-09-30 22:11:51.178464", + "step": 1393, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:51.234581", + "step": 1393, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013055319897830486, + "timestamp": "2025-09-30 22:11:51.237281", + "step": 1394, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:51.292813", + "step": 1394, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004737398587167263, + "timestamp": "2025-09-30 22:11:51.294813", + "step": 1395, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:51.348986", + "step": 1395, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002370731672272086, + "timestamp": "2025-09-30 22:11:51.357468", + "step": 1396, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:51.411158", + "step": 1396, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002246677177026868, + "timestamp": "2025-09-30 22:11:51.417173", + "step": 1397, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:51.475633", + "step": 1397, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05915753170847893, + "timestamp": "2025-09-30 22:11:51.481055", + "step": 1398, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:51.536694", + "step": 1398, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007206898648291826, + "timestamp": "2025-09-30 22:11:51.540164", + "step": 1399, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:51.596906", + "step": 1399, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003330029547214508, + "timestamp": "2025-09-30 22:11:51.604499", + "step": 1400, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:51.661337", + "step": 1400, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01776135340332985, + "timestamp": "2025-09-30 22:11:51.664234", + "step": 1401, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:51.719714", + "step": 1401, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007816934958100319, + "timestamp": "2025-09-30 22:11:51.722380", + "step": 1402, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:51.776606", + "step": 1402, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006343926768749952, + "timestamp": "2025-09-30 22:11:51.779877", + "step": 1403, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:51.834652", + "step": 1403, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01646248623728752, + "timestamp": "2025-09-30 22:11:51.842243", + "step": 1404, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:51.897403", + "step": 1404, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011257813312113285, + "timestamp": "2025-09-30 22:11:51.900431", + "step": 1405, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:51.953481", + "step": 1405, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010497097857296467, + "timestamp": "2025-09-30 22:11:51.959746", + "step": 1406, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:52.015068", + "step": 1406, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002651577116921544, + "timestamp": "2025-09-30 22:11:52.019159", + "step": 1407, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:52.072606", + "step": 1407, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03266320377588272, + "timestamp": "2025-09-30 22:11:52.085621", + "step": 1408, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:52.141617", + "step": 1408, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01602781191468239, + "timestamp": "2025-09-30 22:11:52.146954", + "step": 1409, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:52.203737", + "step": 1409, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002372512361034751, + "timestamp": "2025-09-30 22:11:52.207368", + "step": 1410, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:52.262628", + "step": 1410, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014318128116428852, + "timestamp": "2025-09-30 22:11:52.266816", + "step": 1411, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:52.323101", + "step": 1411, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00335204997099936, + "timestamp": "2025-09-30 22:11:52.333507", + "step": 1412, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:52.399815", + "step": 1412, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006724204868078232, + "timestamp": "2025-09-30 22:11:52.405354", + "step": 1413, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:52.460557", + "step": 1413, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008304744958877563, + "timestamp": "2025-09-30 22:11:52.470446", + "step": 1414, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:52.532792", + "step": 1414, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013678102754056454, + "timestamp": "2025-09-30 22:11:52.535675", + "step": 1415, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:52.590913", + "step": 1415, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00877212081104517, + "timestamp": "2025-09-30 22:11:52.596849", + "step": 1416, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:52.649257", + "step": 1416, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0023188970517367125, + "timestamp": "2025-09-30 22:11:52.651657", + "step": 1417, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:52.706530", + "step": 1417, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002683415310457349, + "timestamp": "2025-09-30 22:11:52.709269", + "step": 1418, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:52.772989", + "step": 1418, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006297766696661711, + "timestamp": "2025-09-30 22:11:52.775729", + "step": 1419, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:52.830863", + "step": 1419, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018091067671775818, + "timestamp": "2025-09-30 22:11:52.838563", + "step": 1420, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:52.892921", + "step": 1420, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015514843165874481, + "timestamp": "2025-09-30 22:11:52.895667", + "step": 1421, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:52.949542", + "step": 1421, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0016685453010722995, + "timestamp": "2025-09-30 22:11:52.959052", + "step": 1422, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:53.015770", + "step": 1422, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04028955474495888, + "timestamp": "2025-09-30 22:11:53.018535", + "step": 1423, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:53.074396", + "step": 1423, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018086543306708336, + "timestamp": "2025-09-30 22:11:53.081604", + "step": 1424, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:53.137519", + "step": 1424, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03144695982336998, + "timestamp": "2025-09-30 22:11:53.145979", + "step": 1425, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:11:54.385788", + "step": 1425, + "epoch": 2 + }, + { + "type": "pplx", + "content": 32876480.431024157, + "timestamp": "2025-09-30 22:11:54.388368", + "step": 1425, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:54.441452", + "step": 1425, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0008492738706991076, + "timestamp": "2025-09-30 22:11:54.444996", + "step": 1426, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:54.497804", + "step": 1426, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006496089976280928, + "timestamp": "2025-09-30 22:11:54.500486", + "step": 1427, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:54.554857", + "step": 1427, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.046025075018405914, + "timestamp": "2025-09-30 22:11:54.561183", + "step": 1428, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:54.620775", + "step": 1428, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004891328979283571, + "timestamp": "2025-09-30 22:11:54.623529", + "step": 1429, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:54.678675", + "step": 1429, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006063086446374655, + "timestamp": "2025-09-30 22:11:54.689551", + "step": 1430, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:54.744569", + "step": 1430, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015636909753084183, + "timestamp": "2025-09-30 22:11:54.746833", + "step": 1431, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:54.801259", + "step": 1431, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009860769845545292, + "timestamp": "2025-09-30 22:11:54.812696", + "step": 1432, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:54.867412", + "step": 1432, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021920878440141678, + "timestamp": "2025-09-30 22:11:54.873928", + "step": 1433, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:54.933680", + "step": 1433, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03438463434576988, + "timestamp": "2025-09-30 22:11:54.935946", + "step": 1434, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:54.990181", + "step": 1434, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010635143145918846, + "timestamp": "2025-09-30 22:11:54.992804", + "step": 1435, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:55.055330", + "step": 1435, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00384154194034636, + "timestamp": "2025-09-30 22:11:55.062260", + "step": 1436, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:55.116261", + "step": 1436, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01956865005195141, + "timestamp": "2025-09-30 22:11:55.126079", + "step": 1437, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:55.184259", + "step": 1437, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011690896935760975, + "timestamp": "2025-09-30 22:11:55.187051", + "step": 1438, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:55.240826", + "step": 1438, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0043997629545629025, + "timestamp": "2025-09-30 22:11:55.242792", + "step": 1439, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:55.295661", + "step": 1439, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0012786614242941141, + "timestamp": "2025-09-30 22:11:55.301411", + "step": 1440, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:55.353838", + "step": 1440, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.001398958032950759, + "timestamp": "2025-09-30 22:11:55.356227", + "step": 1441, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:55.412017", + "step": 1441, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0023240819573402405, + "timestamp": "2025-09-30 22:11:55.417275", + "step": 1442, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:55.470340", + "step": 1442, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014607422053813934, + "timestamp": "2025-09-30 22:11:55.475088", + "step": 1443, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:55.528302", + "step": 1443, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.001739148749038577, + "timestamp": "2025-09-30 22:11:55.534596", + "step": 1444, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:55.591866", + "step": 1444, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011579596437513828, + "timestamp": "2025-09-30 22:11:55.595434", + "step": 1445, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:55.649148", + "step": 1445, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01198033057153225, + "timestamp": "2025-09-30 22:11:55.651642", + "step": 1446, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:55.705180", + "step": 1446, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010683462955057621, + "timestamp": "2025-09-30 22:11:55.707304", + "step": 1447, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:55.762754", + "step": 1447, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010873474180698395, + "timestamp": "2025-09-30 22:11:55.768493", + "step": 1448, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:55.821216", + "step": 1448, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016803989186882973, + "timestamp": "2025-09-30 22:11:55.824294", + "step": 1449, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:55.879996", + "step": 1449, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024432357400655746, + "timestamp": "2025-09-30 22:11:55.883382", + "step": 1450, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:55.937573", + "step": 1450, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01734604313969612, + "timestamp": "2025-09-30 22:11:55.941133", + "step": 1451, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:55.994416", + "step": 1451, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002315593184903264, + "timestamp": "2025-09-30 22:11:56.001288", + "step": 1452, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:56.055517", + "step": 1452, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0013011764967814088, + "timestamp": "2025-09-30 22:11:56.057536", + "step": 1453, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:56.111279", + "step": 1453, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014267000369727612, + "timestamp": "2025-09-30 22:11:56.115214", + "step": 1454, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:56.177103", + "step": 1454, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01388590969145298, + "timestamp": "2025-09-30 22:11:56.179331", + "step": 1455, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:56.231796", + "step": 1455, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03596239164471626, + "timestamp": "2025-09-30 22:11:56.238002", + "step": 1456, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:56.291186", + "step": 1456, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008110507391393185, + "timestamp": "2025-09-30 22:11:56.293601", + "step": 1457, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:56.346799", + "step": 1457, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005775760859251022, + "timestamp": "2025-09-30 22:11:56.348855", + "step": 1458, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:56.401731", + "step": 1458, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02051071636378765, + "timestamp": "2025-09-30 22:11:56.403960", + "step": 1459, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:56.457070", + "step": 1459, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003823460778221488, + "timestamp": "2025-09-30 22:11:56.463416", + "step": 1460, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:56.516251", + "step": 1460, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05588651821017265, + "timestamp": "2025-09-30 22:11:56.519240", + "step": 1461, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:56.572359", + "step": 1461, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011291184462606907, + "timestamp": "2025-09-30 22:11:56.574855", + "step": 1462, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:56.638559", + "step": 1462, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02610655501484871, + "timestamp": "2025-09-30 22:11:56.640892", + "step": 1463, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:56.694525", + "step": 1463, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.025093723088502884, + "timestamp": "2025-09-30 22:11:56.702308", + "step": 1464, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:56.754189", + "step": 1464, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018963143229484558, + "timestamp": "2025-09-30 22:11:56.762237", + "step": 1465, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:56.815551", + "step": 1465, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02996927499771118, + "timestamp": "2025-09-30 22:11:56.818951", + "step": 1466, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:56.871563", + "step": 1466, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.031889282166957855, + "timestamp": "2025-09-30 22:11:56.874567", + "step": 1467, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:56.929697", + "step": 1467, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013061118312180042, + "timestamp": "2025-09-30 22:11:56.938671", + "step": 1468, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:56.992670", + "step": 1468, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021829981356859207, + "timestamp": "2025-09-30 22:11:56.995200", + "step": 1469, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:57.048959", + "step": 1469, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.048463720828294754, + "timestamp": "2025-09-30 22:11:57.051096", + "step": 1470, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:57.105679", + "step": 1470, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0270835030823946, + "timestamp": "2025-09-30 22:11:57.107848", + "step": 1471, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:57.160647", + "step": 1471, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007283608429133892, + "timestamp": "2025-09-30 22:11:57.166422", + "step": 1472, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:57.221551", + "step": 1472, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014574305154383183, + "timestamp": "2025-09-30 22:11:57.223680", + "step": 1473, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:57.280308", + "step": 1473, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014809882268309593, + "timestamp": "2025-09-30 22:11:57.282233", + "step": 1474, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:57.335185", + "step": 1474, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010421657003462315, + "timestamp": "2025-09-30 22:11:57.337900", + "step": 1475, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:57.391394", + "step": 1475, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.025253277271986008, + "timestamp": "2025-09-30 22:11:57.399377", + "step": 1476, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:57.452018", + "step": 1476, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005706873722374439, + "timestamp": "2025-09-30 22:11:57.454324", + "step": 1477, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:57.507110", + "step": 1477, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007750978227704763, + "timestamp": "2025-09-30 22:11:57.510333", + "step": 1478, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:57.563595", + "step": 1478, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0170641727745533, + "timestamp": "2025-09-30 22:11:57.565865", + "step": 1479, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:57.618452", + "step": 1479, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010642343200743198, + "timestamp": "2025-09-30 22:11:57.624447", + "step": 1480, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:57.677415", + "step": 1480, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022640392184257507, + "timestamp": "2025-09-30 22:11:57.680487", + "step": 1481, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:57.733786", + "step": 1481, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012234336696565151, + "timestamp": "2025-09-30 22:11:57.736389", + "step": 1482, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:11:58.929466", + "step": 1482, + "epoch": 2 + }, + { + "type": "pplx", + "content": 31872365.783038512, + "timestamp": "2025-09-30 22:11:58.932319", + "step": 1482, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:58.984024", + "step": 1482, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016802014783024788, + "timestamp": "2025-09-30 22:11:58.986699", + "step": 1483, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:59.041050", + "step": 1483, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010017321445047855, + "timestamp": "2025-09-30 22:11:59.047373", + "step": 1484, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:11:59.103047", + "step": 1484, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01991168037056923, + "timestamp": "2025-09-30 22:11:59.106592", + "step": 1485, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:59.161064", + "step": 1485, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02465466596186161, + "timestamp": "2025-09-30 22:11:59.163538", + "step": 1486, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:59.218736", + "step": 1486, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011366413906216621, + "timestamp": "2025-09-30 22:11:59.221346", + "step": 1487, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:59.275277", + "step": 1487, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009791248477995396, + "timestamp": "2025-09-30 22:11:59.282523", + "step": 1488, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:59.335043", + "step": 1488, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012804691679775715, + "timestamp": "2025-09-30 22:11:59.337565", + "step": 1489, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:59.391128", + "step": 1489, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008853507228195667, + "timestamp": "2025-09-30 22:11:59.394664", + "step": 1490, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:59.448734", + "step": 1490, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007088639307767153, + "timestamp": "2025-09-30 22:11:59.451173", + "step": 1491, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:59.504600", + "step": 1491, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017957476899027824, + "timestamp": "2025-09-30 22:11:59.511239", + "step": 1492, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:59.564635", + "step": 1492, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011476176790893078, + "timestamp": "2025-09-30 22:11:59.567245", + "step": 1493, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:11:59.622806", + "step": 1493, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014955559745430946, + "timestamp": "2025-09-30 22:11:59.625363", + "step": 1494, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:11:59.678610", + "step": 1494, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020908983424305916, + "timestamp": "2025-09-30 22:11:59.681901", + "step": 1495, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:59.737652", + "step": 1495, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007830632850527763, + "timestamp": "2025-09-30 22:11:59.743947", + "step": 1496, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:59.797627", + "step": 1496, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007764595095068216, + "timestamp": "2025-09-30 22:11:59.800420", + "step": 1497, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:59.855257", + "step": 1497, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01050377357751131, + "timestamp": "2025-09-30 22:11:59.857838", + "step": 1498, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:11:59.913101", + "step": 1498, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008915500715374947, + "timestamp": "2025-09-30 22:11:59.915703", + "step": 1499, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:11:59.971399", + "step": 1499, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015954632312059402, + "timestamp": "2025-09-30 22:11:59.977693", + "step": 1500, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1500", + "timestamp": "2025-09-30 22:12:00.388871", + "step": 1500, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:00.447231", + "step": 1500, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011480267159640789, + "timestamp": "2025-09-30 22:12:00.449372", + "step": 1501, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:00.503242", + "step": 1501, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010660061612725258, + "timestamp": "2025-09-30 22:12:00.505412", + "step": 1502, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:00.559660", + "step": 1502, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00754969660192728, + "timestamp": "2025-09-30 22:12:00.562034", + "step": 1503, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:00.619455", + "step": 1503, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006606790237128735, + "timestamp": "2025-09-30 22:12:00.625333", + "step": 1504, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:00.684398", + "step": 1504, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024877065792679787, + "timestamp": "2025-09-30 22:12:00.686471", + "step": 1505, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:00.740071", + "step": 1505, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018199782818555832, + "timestamp": "2025-09-30 22:12:00.742556", + "step": 1506, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:00.798637", + "step": 1506, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024665992707014084, + "timestamp": "2025-09-30 22:12:00.800950", + "step": 1507, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-30 22:12:00.860320", + "step": 1507, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010990189388394356, + "timestamp": "2025-09-30 22:12:00.866243", + "step": 1508, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:00.919912", + "step": 1508, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011042303405702114, + "timestamp": "2025-09-30 22:12:00.922231", + "step": 1509, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:00.976152", + "step": 1509, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016723833978176117, + "timestamp": "2025-09-30 22:12:00.978834", + "step": 1510, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:01.033142", + "step": 1510, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0033279487397521734, + "timestamp": "2025-09-30 22:12:01.035168", + "step": 1511, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:01.088589", + "step": 1511, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012400150299072266, + "timestamp": "2025-09-30 22:12:01.094521", + "step": 1512, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:01.150517", + "step": 1512, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011185402050614357, + "timestamp": "2025-09-30 22:12:01.153351", + "step": 1513, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:01.207943", + "step": 1513, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01125926524400711, + "timestamp": "2025-09-30 22:12:01.210391", + "step": 1514, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:01.265310", + "step": 1514, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009529463946819305, + "timestamp": "2025-09-30 22:12:01.267633", + "step": 1515, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:01.329492", + "step": 1515, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03866703063249588, + "timestamp": "2025-09-30 22:12:01.335633", + "step": 1516, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:01.393980", + "step": 1516, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009177983738481998, + "timestamp": "2025-09-30 22:12:01.396395", + "step": 1517, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:01.453397", + "step": 1517, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02175569161772728, + "timestamp": "2025-09-30 22:12:01.455615", + "step": 1518, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:01.508806", + "step": 1518, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012183894403278828, + "timestamp": "2025-09-30 22:12:01.510982", + "step": 1519, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:01.566324", + "step": 1519, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008869107812643051, + "timestamp": "2025-09-30 22:12:01.574362", + "step": 1520, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:01.628660", + "step": 1520, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021661145612597466, + "timestamp": "2025-09-30 22:12:01.631165", + "step": 1521, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:01.691533", + "step": 1521, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019203439354896545, + "timestamp": "2025-09-30 22:12:01.695020", + "step": 1522, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:01.749280", + "step": 1522, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010725765489041805, + "timestamp": "2025-09-30 22:12:01.751519", + "step": 1523, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:01.806614", + "step": 1523, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009876924566924572, + "timestamp": "2025-09-30 22:12:01.812145", + "step": 1524, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:01.867986", + "step": 1524, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011706030927598476, + "timestamp": "2025-09-30 22:12:01.870018", + "step": 1525, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:01.923199", + "step": 1525, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011448273435235023, + "timestamp": "2025-09-30 22:12:01.925038", + "step": 1526, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:01.977941", + "step": 1526, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009272503666579723, + "timestamp": "2025-09-30 22:12:01.979995", + "step": 1527, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:02.033727", + "step": 1527, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05378647893667221, + "timestamp": "2025-09-30 22:12:02.039275", + "step": 1528, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:02.091647", + "step": 1528, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024520421400666237, + "timestamp": "2025-09-30 22:12:02.093742", + "step": 1529, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:02.146154", + "step": 1529, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024694928899407387, + "timestamp": "2025-09-30 22:12:02.148676", + "step": 1530, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:02.203002", + "step": 1530, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007078372407704592, + "timestamp": "2025-09-30 22:12:02.205450", + "step": 1531, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:02.259443", + "step": 1531, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.030642852187156677, + "timestamp": "2025-09-30 22:12:02.265928", + "step": 1532, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:02.319607", + "step": 1532, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016413046047091484, + "timestamp": "2025-09-30 22:12:02.321885", + "step": 1533, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:02.377357", + "step": 1533, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026809586212038994, + "timestamp": "2025-09-30 22:12:02.379738", + "step": 1534, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:02.434324", + "step": 1534, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02859729900956154, + "timestamp": "2025-09-30 22:12:02.437108", + "step": 1535, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:02.492084", + "step": 1535, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013830293901264668, + "timestamp": "2025-09-30 22:12:02.497710", + "step": 1536, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:02.550441", + "step": 1536, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01072558481246233, + "timestamp": "2025-09-30 22:12:02.553119", + "step": 1537, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:02.612546", + "step": 1537, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03165407106280327, + "timestamp": "2025-09-30 22:12:02.614540", + "step": 1538, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:02.669484", + "step": 1538, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006038249935954809, + "timestamp": "2025-09-30 22:12:02.671470", + "step": 1539, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:12:03.885059", + "step": 1539, + "epoch": 2 + }, + { + "type": "pplx", + "content": 35072222.03899579, + "timestamp": "2025-09-30 22:12:03.886596", + "step": 1539, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:03.938357", + "step": 1539, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019516736268997192, + "timestamp": "2025-09-30 22:12:03.944029", + "step": 1540, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:03.997583", + "step": 1540, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01919463463127613, + "timestamp": "2025-09-30 22:12:03.999837", + "step": 1541, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:04.053989", + "step": 1541, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02023499831557274, + "timestamp": "2025-09-30 22:12:04.057050", + "step": 1542, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:04.109535", + "step": 1542, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0022135619074106216, + "timestamp": "2025-09-30 22:12:04.111673", + "step": 1543, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:04.164050", + "step": 1543, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008789089508354664, + "timestamp": "2025-09-30 22:12:04.170203", + "step": 1544, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:04.224306", + "step": 1544, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013264109380543232, + "timestamp": "2025-09-30 22:12:04.231236", + "step": 1545, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:04.286705", + "step": 1545, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.035324085503816605, + "timestamp": "2025-09-30 22:12:04.291376", + "step": 1546, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:04.347304", + "step": 1546, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.028284629806876183, + "timestamp": "2025-09-30 22:12:04.349678", + "step": 1547, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:04.405061", + "step": 1547, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007114126812666655, + "timestamp": "2025-09-30 22:12:04.410655", + "step": 1548, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:04.464055", + "step": 1548, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0037361420691013336, + "timestamp": "2025-09-30 22:12:04.466154", + "step": 1549, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:04.519128", + "step": 1549, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00416518421843648, + "timestamp": "2025-09-30 22:12:04.521220", + "step": 1550, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:04.574047", + "step": 1550, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012675133533775806, + "timestamp": "2025-09-30 22:12:04.576126", + "step": 1551, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:04.629683", + "step": 1551, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018537061288952827, + "timestamp": "2025-09-30 22:12:04.635243", + "step": 1552, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:04.688065", + "step": 1552, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010581501759588718, + "timestamp": "2025-09-30 22:12:04.690083", + "step": 1553, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:04.743054", + "step": 1553, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0007142522372305393, + "timestamp": "2025-09-30 22:12:04.745197", + "step": 1554, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:04.798016", + "step": 1554, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03923966363072395, + "timestamp": "2025-09-30 22:12:04.800205", + "step": 1555, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:04.854036", + "step": 1555, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02381220832467079, + "timestamp": "2025-09-30 22:12:04.859829", + "step": 1556, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:04.912029", + "step": 1556, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002656190888956189, + "timestamp": "2025-09-30 22:12:04.914059", + "step": 1557, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:04.969018", + "step": 1557, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009249404072761536, + "timestamp": "2025-09-30 22:12:04.971667", + "step": 1558, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:05.026246", + "step": 1558, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013429306447505951, + "timestamp": "2025-09-30 22:12:05.028394", + "step": 1559, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:05.081243", + "step": 1559, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0022556984331458807, + "timestamp": "2025-09-30 22:12:05.086891", + "step": 1560, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:05.139568", + "step": 1560, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.039768513292074203, + "timestamp": "2025-09-30 22:12:05.141838", + "step": 1561, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:05.194858", + "step": 1561, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002185255754739046, + "timestamp": "2025-09-30 22:12:05.197796", + "step": 1562, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:05.250587", + "step": 1562, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03695518150925636, + "timestamp": "2025-09-30 22:12:05.253676", + "step": 1563, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:05.309805", + "step": 1563, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006958664394915104, + "timestamp": "2025-09-30 22:12:05.315287", + "step": 1564, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:05.367430", + "step": 1564, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.001130191725678742, + "timestamp": "2025-09-30 22:12:05.370117", + "step": 1565, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:05.423414", + "step": 1565, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0016752462834119797, + "timestamp": "2025-09-30 22:12:05.425530", + "step": 1566, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:05.477789", + "step": 1566, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0018236959585919976, + "timestamp": "2025-09-30 22:12:05.480120", + "step": 1567, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:05.532586", + "step": 1567, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022834938019514084, + "timestamp": "2025-09-30 22:12:05.538287", + "step": 1568, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:05.590319", + "step": 1568, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006333382334560156, + "timestamp": "2025-09-30 22:12:05.592495", + "step": 1569, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:05.644948", + "step": 1569, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012706826440989971, + "timestamp": "2025-09-30 22:12:05.647109", + "step": 1570, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:05.699654", + "step": 1570, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02377459593117237, + "timestamp": "2025-09-30 22:12:05.701745", + "step": 1571, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:05.754334", + "step": 1571, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011410296894609928, + "timestamp": "2025-09-30 22:12:05.760192", + "step": 1572, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:05.815069", + "step": 1572, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004736430011689663, + "timestamp": "2025-09-30 22:12:05.817268", + "step": 1573, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:05.869880", + "step": 1573, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.001076001557521522, + "timestamp": "2025-09-30 22:12:05.872137", + "step": 1574, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:05.925587", + "step": 1574, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023327454924583435, + "timestamp": "2025-09-30 22:12:05.927859", + "step": 1575, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:05.981165", + "step": 1575, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0017735332949087024, + "timestamp": "2025-09-30 22:12:05.987084", + "step": 1576, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:06.039957", + "step": 1576, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010920924134552479, + "timestamp": "2025-09-30 22:12:06.042225", + "step": 1577, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:06.096214", + "step": 1577, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.028373869135975838, + "timestamp": "2025-09-30 22:12:06.098273", + "step": 1578, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:06.152111", + "step": 1578, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0025909452233463526, + "timestamp": "2025-09-30 22:12:06.154864", + "step": 1579, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:06.209637", + "step": 1579, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01281531248241663, + "timestamp": "2025-09-30 22:12:06.216014", + "step": 1580, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:06.269328", + "step": 1580, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01906949281692505, + "timestamp": "2025-09-30 22:12:06.272333", + "step": 1581, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:06.326030", + "step": 1581, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008657933212816715, + "timestamp": "2025-09-30 22:12:06.328046", + "step": 1582, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:06.380427", + "step": 1582, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011052996851503849, + "timestamp": "2025-09-30 22:12:06.382745", + "step": 1583, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:06.436203", + "step": 1583, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008702821098268032, + "timestamp": "2025-09-30 22:12:06.441965", + "step": 1584, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:06.493780", + "step": 1584, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01199591439217329, + "timestamp": "2025-09-30 22:12:06.495960", + "step": 1585, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:06.548081", + "step": 1585, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02052219584584236, + "timestamp": "2025-09-30 22:12:06.550557", + "step": 1586, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:06.603416", + "step": 1586, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.029241319745779037, + "timestamp": "2025-09-30 22:12:06.605525", + "step": 1587, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:06.657839", + "step": 1587, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004892691969871521, + "timestamp": "2025-09-30 22:12:06.663407", + "step": 1588, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:06.727359", + "step": 1588, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021011924371123314, + "timestamp": "2025-09-30 22:12:06.729566", + "step": 1589, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:06.782007", + "step": 1589, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01708308607339859, + "timestamp": "2025-09-30 22:12:06.784478", + "step": 1590, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:06.837235", + "step": 1590, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.035234589129686356, + "timestamp": "2025-09-30 22:12:06.839857", + "step": 1591, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:06.892589", + "step": 1591, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.047162704169750214, + "timestamp": "2025-09-30 22:12:06.898404", + "step": 1592, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:06.950346", + "step": 1592, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010256440378725529, + "timestamp": "2025-09-30 22:12:06.952596", + "step": 1593, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:07.005219", + "step": 1593, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011917630210518837, + "timestamp": "2025-09-30 22:12:07.007233", + "step": 1594, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:07.062589", + "step": 1594, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019979296252131462, + "timestamp": "2025-09-30 22:12:07.064654", + "step": 1595, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:07.117139", + "step": 1595, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020958244800567627, + "timestamp": "2025-09-30 22:12:07.122974", + "step": 1596, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:12:08.315114", + "step": 1596, + "epoch": 2 + }, + { + "type": "pplx", + "content": 33021765.354655504, + "timestamp": "2025-09-30 22:12:08.317184", + "step": 1596, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:08.368251", + "step": 1596, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00980797503143549, + "timestamp": "2025-09-30 22:12:08.370313", + "step": 1597, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:08.433892", + "step": 1597, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009293398819863796, + "timestamp": "2025-09-30 22:12:08.435957", + "step": 1598, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:08.488754", + "step": 1598, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015030805952847004, + "timestamp": "2025-09-30 22:12:08.491114", + "step": 1599, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:08.549008", + "step": 1599, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011158740147948265, + "timestamp": "2025-09-30 22:12:08.554545", + "step": 1600, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:08.610775", + "step": 1600, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005423234310001135, + "timestamp": "2025-09-30 22:12:08.612759", + "step": 1601, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:08.666779", + "step": 1601, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02317948266863823, + "timestamp": "2025-09-30 22:12:08.668958", + "step": 1602, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:08.722140", + "step": 1602, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01775677688419819, + "timestamp": "2025-09-30 22:12:08.724860", + "step": 1603, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:08.791960", + "step": 1603, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0035264005418866873, + "timestamp": "2025-09-30 22:12:08.798017", + "step": 1604, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:08.851106", + "step": 1604, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006642151158303022, + "timestamp": "2025-09-30 22:12:08.853289", + "step": 1605, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:08.906804", + "step": 1605, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026895778253674507, + "timestamp": "2025-09-30 22:12:08.909344", + "step": 1606, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:08.964905", + "step": 1606, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008761496283113956, + "timestamp": "2025-09-30 22:12:08.967384", + "step": 1607, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:09.020829", + "step": 1607, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011370251886546612, + "timestamp": "2025-09-30 22:12:09.026859", + "step": 1608, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:09.079870", + "step": 1608, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007803045213222504, + "timestamp": "2025-09-30 22:12:09.081910", + "step": 1609, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:09.135803", + "step": 1609, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02594866044819355, + "timestamp": "2025-09-30 22:12:09.138287", + "step": 1610, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:09.191233", + "step": 1610, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03655938431620598, + "timestamp": "2025-09-30 22:12:09.194673", + "step": 1611, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:09.250578", + "step": 1611, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005339773837476969, + "timestamp": "2025-09-30 22:12:09.258772", + "step": 1612, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:09.313826", + "step": 1612, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013894207775592804, + "timestamp": "2025-09-30 22:12:09.316505", + "step": 1613, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:09.370619", + "step": 1613, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005706585478037596, + "timestamp": "2025-09-30 22:12:09.373401", + "step": 1614, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:09.427451", + "step": 1614, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005220034625381231, + "timestamp": "2025-09-30 22:12:09.430066", + "step": 1615, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:09.483919", + "step": 1615, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02249130979180336, + "timestamp": "2025-09-30 22:12:09.490894", + "step": 1616, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:09.545619", + "step": 1616, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024420803412795067, + "timestamp": "2025-09-30 22:12:09.548516", + "step": 1617, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:09.601846", + "step": 1617, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006213244050741196, + "timestamp": "2025-09-30 22:12:09.604007", + "step": 1618, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:09.657294", + "step": 1618, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010518096387386322, + "timestamp": "2025-09-30 22:12:09.660390", + "step": 1619, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:09.715056", + "step": 1619, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016626330092549324, + "timestamp": "2025-09-30 22:12:09.721021", + "step": 1620, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:09.778310", + "step": 1620, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010907611809670925, + "timestamp": "2025-09-30 22:12:09.780560", + "step": 1621, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:09.835444", + "step": 1621, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010932761244475842, + "timestamp": "2025-09-30 22:12:09.838341", + "step": 1622, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:09.891989", + "step": 1622, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002364618005231023, + "timestamp": "2025-09-30 22:12:09.894614", + "step": 1623, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:09.949021", + "step": 1623, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0025081944186240435, + "timestamp": "2025-09-30 22:12:09.955528", + "step": 1624, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:10.012894", + "step": 1624, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011359497904777527, + "timestamp": "2025-09-30 22:12:10.015406", + "step": 1625, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:10.068841", + "step": 1625, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0060988375917077065, + "timestamp": "2025-09-30 22:12:10.071461", + "step": 1626, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:10.127759", + "step": 1626, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05257219076156616, + "timestamp": "2025-09-30 22:12:10.130388", + "step": 1627, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:12:10.185216", + "step": 1627, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012155899778008461, + "timestamp": "2025-09-30 22:12:10.192809", + "step": 1628, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:10.248137", + "step": 1628, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01888015680015087, + "timestamp": "2025-09-30 22:12:10.251948", + "step": 1629, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:10.310545", + "step": 1629, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011508808471262455, + "timestamp": "2025-09-30 22:12:10.314123", + "step": 1630, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:10.369979", + "step": 1630, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009577545337378979, + "timestamp": "2025-09-30 22:12:10.372814", + "step": 1631, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:10.427544", + "step": 1631, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022420035675168037, + "timestamp": "2025-09-30 22:12:10.433549", + "step": 1632, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:10.487384", + "step": 1632, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013017824850976467, + "timestamp": "2025-09-30 22:12:10.490027", + "step": 1633, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:10.543592", + "step": 1633, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006547243800014257, + "timestamp": "2025-09-30 22:12:10.545311", + "step": 1634, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:10.603149", + "step": 1634, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016644669696688652, + "timestamp": "2025-09-30 22:12:10.604931", + "step": 1635, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:10.660831", + "step": 1635, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009076499380171299, + "timestamp": "2025-09-30 22:12:10.666106", + "step": 1636, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:10.719634", + "step": 1636, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0293984804302454, + "timestamp": "2025-09-30 22:12:10.721835", + "step": 1637, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:10.776206", + "step": 1637, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.001451183925382793, + "timestamp": "2025-09-30 22:12:10.778315", + "step": 1638, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:10.832230", + "step": 1638, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002988268854096532, + "timestamp": "2025-09-30 22:12:10.834016", + "step": 1639, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:10.888355", + "step": 1639, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007970958016812801, + "timestamp": "2025-09-30 22:12:10.893965", + "step": 1640, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:10.947408", + "step": 1640, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026539817452430725, + "timestamp": "2025-09-30 22:12:10.949401", + "step": 1641, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:11.003469", + "step": 1641, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017281271517276764, + "timestamp": "2025-09-30 22:12:11.005352", + "step": 1642, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:11.067465", + "step": 1642, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008392428047955036, + "timestamp": "2025-09-30 22:12:11.069643", + "step": 1643, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:11.125032", + "step": 1643, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01337872352451086, + "timestamp": "2025-09-30 22:12:11.130293", + "step": 1644, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:11.183409", + "step": 1644, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005623048637062311, + "timestamp": "2025-09-30 22:12:11.186127", + "step": 1645, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:11.241311", + "step": 1645, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009559988044202328, + "timestamp": "2025-09-30 22:12:11.243866", + "step": 1646, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:11.296841", + "step": 1646, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010462108068168163, + "timestamp": "2025-09-30 22:12:11.299819", + "step": 1647, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:11.354919", + "step": 1647, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008628972806036472, + "timestamp": "2025-09-30 22:12:11.360579", + "step": 1648, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:11.413579", + "step": 1648, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017333444207906723, + "timestamp": "2025-09-30 22:12:11.415784", + "step": 1649, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:11.473420", + "step": 1649, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007165636867284775, + "timestamp": "2025-09-30 22:12:11.475820", + "step": 1650, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:11.529599", + "step": 1650, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005024661775678396, + "timestamp": "2025-09-30 22:12:11.531962", + "step": 1651, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:11.586225", + "step": 1651, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003687667427584529, + "timestamp": "2025-09-30 22:12:11.591545", + "step": 1652, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:11.644033", + "step": 1652, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01683473400771618, + "timestamp": "2025-09-30 22:12:11.645842", + "step": 1653, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:12:12.861767", + "step": 1653, + "epoch": 2 + }, + { + "type": "pplx", + "content": 31819411.36838668, + "timestamp": "2025-09-30 22:12:12.875645", + "step": 1653, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:12.927653", + "step": 1653, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019419243559241295, + "timestamp": "2025-09-30 22:12:12.929830", + "step": 1654, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:12.982438", + "step": 1654, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004655472934246063, + "timestamp": "2025-09-30 22:12:12.984630", + "step": 1655, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:13.038205", + "step": 1655, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0069176494143903255, + "timestamp": "2025-09-30 22:12:13.044005", + "step": 1656, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:13.096761", + "step": 1656, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0254069771617651, + "timestamp": "2025-09-30 22:12:13.098612", + "step": 1657, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:13.151554", + "step": 1657, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02898487262427807, + "timestamp": "2025-09-30 22:12:13.153772", + "step": 1658, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:13.207387", + "step": 1658, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010060988366603851, + "timestamp": "2025-09-30 22:12:13.210551", + "step": 1659, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:13.264259", + "step": 1659, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004005379509180784, + "timestamp": "2025-09-30 22:12:13.272363", + "step": 1660, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:13.334659", + "step": 1660, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.043611329048871994, + "timestamp": "2025-09-30 22:12:13.337118", + "step": 1661, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:13.389890", + "step": 1661, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007646446116268635, + "timestamp": "2025-09-30 22:12:13.391929", + "step": 1662, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:13.445571", + "step": 1662, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008269752375781536, + "timestamp": "2025-09-30 22:12:13.447687", + "step": 1663, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:13.503140", + "step": 1663, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003249566303566098, + "timestamp": "2025-09-30 22:12:13.508527", + "step": 1664, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:13.561133", + "step": 1664, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010090269148349762, + "timestamp": "2025-09-30 22:12:13.562734", + "step": 1665, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:13.615328", + "step": 1665, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005579036194831133, + "timestamp": "2025-09-30 22:12:13.616959", + "step": 1666, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:13.670719", + "step": 1666, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023332465440034866, + "timestamp": "2025-09-30 22:12:13.672956", + "step": 1667, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:13.726365", + "step": 1667, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03554206341505051, + "timestamp": "2025-09-30 22:12:13.732007", + "step": 1668, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:13.785078", + "step": 1668, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.059974852949380875, + "timestamp": "2025-09-30 22:12:13.787293", + "step": 1669, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:13.840026", + "step": 1669, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00815680343657732, + "timestamp": "2025-09-30 22:12:13.842053", + "step": 1670, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:13.897501", + "step": 1670, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013160581700503826, + "timestamp": "2025-09-30 22:12:13.899928", + "step": 1671, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:13.953678", + "step": 1671, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011886881664395332, + "timestamp": "2025-09-30 22:12:13.959315", + "step": 1672, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:14.016600", + "step": 1672, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011096789501607418, + "timestamp": "2025-09-30 22:12:14.018598", + "step": 1673, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:14.072022", + "step": 1673, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002368275774642825, + "timestamp": "2025-09-30 22:12:14.074112", + "step": 1674, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:14.128129", + "step": 1674, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00044797913869842887, + "timestamp": "2025-09-30 22:12:14.130228", + "step": 1675, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:14.184419", + "step": 1675, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0028583400417119265, + "timestamp": "2025-09-30 22:12:14.190607", + "step": 1676, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:14.244200", + "step": 1676, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008487415499985218, + "timestamp": "2025-09-30 22:12:14.246938", + "step": 1677, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:14.299946", + "step": 1677, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00515100359916687, + "timestamp": "2025-09-30 22:12:14.302158", + "step": 1678, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:12:14.358300", + "step": 1678, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0111812399700284, + "timestamp": "2025-09-30 22:12:14.360428", + "step": 1679, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:14.413857", + "step": 1679, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018307924270629883, + "timestamp": "2025-09-30 22:12:14.419308", + "step": 1680, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:14.474616", + "step": 1680, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005157233215868473, + "timestamp": "2025-09-30 22:12:14.476856", + "step": 1681, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:14.533033", + "step": 1681, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006048670504242182, + "timestamp": "2025-09-30 22:12:14.535241", + "step": 1682, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:14.589464", + "step": 1682, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005315546877682209, + "timestamp": "2025-09-30 22:12:14.592376", + "step": 1683, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:14.653180", + "step": 1683, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004861609544605017, + "timestamp": "2025-09-30 22:12:14.658824", + "step": 1684, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:14.711677", + "step": 1684, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03198011592030525, + "timestamp": "2025-09-30 22:12:14.713847", + "step": 1685, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:14.766204", + "step": 1685, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006582505535334349, + "timestamp": "2025-09-30 22:12:14.768519", + "step": 1686, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:14.824037", + "step": 1686, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.041787777096033096, + "timestamp": "2025-09-30 22:12:14.826140", + "step": 1687, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:14.884293", + "step": 1687, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05907066911458969, + "timestamp": "2025-09-30 22:12:14.890039", + "step": 1688, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:14.944929", + "step": 1688, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.001978454412892461, + "timestamp": "2025-09-30 22:12:14.947025", + "step": 1689, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:15.002116", + "step": 1689, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.052878353744745255, + "timestamp": "2025-09-30 22:12:15.004343", + "step": 1690, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:15.059424", + "step": 1690, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0005954128573648632, + "timestamp": "2025-09-30 22:12:15.061543", + "step": 1691, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:15.114789", + "step": 1691, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02634171023964882, + "timestamp": "2025-09-30 22:12:15.121913", + "step": 1692, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:15.177756", + "step": 1692, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013122417032718658, + "timestamp": "2025-09-30 22:12:15.179846", + "step": 1693, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:15.234822", + "step": 1693, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008947016671299934, + "timestamp": "2025-09-30 22:12:15.238000", + "step": 1694, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:15.292395", + "step": 1694, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0031593344174325466, + "timestamp": "2025-09-30 22:12:15.295329", + "step": 1695, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:15.350920", + "step": 1695, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05288988724350929, + "timestamp": "2025-09-30 22:12:15.356438", + "step": 1696, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:15.411357", + "step": 1696, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005359127651900053, + "timestamp": "2025-09-30 22:12:15.413335", + "step": 1697, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:15.471260", + "step": 1697, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011889653280377388, + "timestamp": "2025-09-30 22:12:15.473427", + "step": 1698, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:15.532562", + "step": 1698, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0044033522717654705, + "timestamp": "2025-09-30 22:12:15.534884", + "step": 1699, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:15.598256", + "step": 1699, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016879115253686905, + "timestamp": "2025-09-30 22:12:15.604579", + "step": 1700, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:15.657992", + "step": 1700, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02566424012184143, + "timestamp": "2025-09-30 22:12:15.660020", + "step": 1701, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:15.713446", + "step": 1701, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0031795850954949856, + "timestamp": "2025-09-30 22:12:15.715503", + "step": 1702, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:15.769623", + "step": 1702, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003098678309470415, + "timestamp": "2025-09-30 22:12:15.771959", + "step": 1703, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:15.828493", + "step": 1703, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0025204757694154978, + "timestamp": "2025-09-30 22:12:15.833993", + "step": 1704, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:15.890623", + "step": 1704, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007822325453162193, + "timestamp": "2025-09-30 22:12:15.892874", + "step": 1705, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:15.945573", + "step": 1705, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017046038061380386, + "timestamp": "2025-09-30 22:12:15.947579", + "step": 1706, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:16.015154", + "step": 1706, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0030831003095954657, + "timestamp": "2025-09-30 22:12:16.017231", + "step": 1707, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:16.072731", + "step": 1707, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00840392243117094, + "timestamp": "2025-09-30 22:12:16.078330", + "step": 1708, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:16.130875", + "step": 1708, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011422915384173393, + "timestamp": "2025-09-30 22:12:16.133009", + "step": 1709, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:16.189790", + "step": 1709, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004428292624652386, + "timestamp": "2025-09-30 22:12:16.191961", + "step": 1710, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:12:17.490807", + "step": 1710, + "epoch": 2 + }, + { + "type": "pplx", + "content": 28349924.714862473, + "timestamp": "2025-09-30 22:12:17.492915", + "step": 1710, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:17.549103", + "step": 1710, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024888822808861732, + "timestamp": "2025-09-30 22:12:17.552103", + "step": 1711, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:17.607307", + "step": 1711, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022045819088816643, + "timestamp": "2025-09-30 22:12:17.613677", + "step": 1712, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:17.671421", + "step": 1712, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006401827093213797, + "timestamp": "2025-09-30 22:12:17.673761", + "step": 1713, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:17.729271", + "step": 1713, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007945166900753975, + "timestamp": "2025-09-30 22:12:17.733538", + "step": 1714, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:17.795084", + "step": 1714, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006832032930105925, + "timestamp": "2025-09-30 22:12:17.797623", + "step": 1715, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:17.854066", + "step": 1715, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010955681093037128, + "timestamp": "2025-09-30 22:12:17.860042", + "step": 1716, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:17.913039", + "step": 1716, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.025570230558514595, + "timestamp": "2025-09-30 22:12:17.917842", + "step": 1717, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:17.972577", + "step": 1717, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013870848342776299, + "timestamp": "2025-09-30 22:12:17.977653", + "step": 1718, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:18.031960", + "step": 1718, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018312016502022743, + "timestamp": "2025-09-30 22:12:18.034566", + "step": 1719, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:18.098990", + "step": 1719, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016707230359315872, + "timestamp": "2025-09-30 22:12:18.104901", + "step": 1720, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:18.169517", + "step": 1720, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010144525207579136, + "timestamp": "2025-09-30 22:12:18.171798", + "step": 1721, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:18.233785", + "step": 1721, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011039652861654758, + "timestamp": "2025-09-30 22:12:18.237053", + "step": 1722, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:18.305230", + "step": 1722, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01270805113017559, + "timestamp": "2025-09-30 22:12:18.313241", + "step": 1723, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:18.368284", + "step": 1723, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00563334533944726, + "timestamp": "2025-09-30 22:12:18.375460", + "step": 1724, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:18.432695", + "step": 1724, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005631479900330305, + "timestamp": "2025-09-30 22:12:18.440294", + "step": 1725, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:18.497729", + "step": 1725, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009919635951519012, + "timestamp": "2025-09-30 22:12:18.505164", + "step": 1726, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:18.560365", + "step": 1726, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023495439440011978, + "timestamp": "2025-09-30 22:12:18.564363", + "step": 1727, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:18.620110", + "step": 1727, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008099724538624287, + "timestamp": "2025-09-30 22:12:18.626786", + "step": 1728, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:18.683795", + "step": 1728, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009610223583877087, + "timestamp": "2025-09-30 22:12:18.697005", + "step": 1729, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:18.753105", + "step": 1729, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009814736433327198, + "timestamp": "2025-09-30 22:12:18.758237", + "step": 1730, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:18.821455", + "step": 1730, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02346763201057911, + "timestamp": "2025-09-30 22:12:18.835026", + "step": 1731, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:18.902723", + "step": 1731, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0012323985574766994, + "timestamp": "2025-09-30 22:12:18.909790", + "step": 1732, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:18.965608", + "step": 1732, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010308587923645973, + "timestamp": "2025-09-30 22:12:18.967963", + "step": 1733, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:19.025664", + "step": 1733, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015463477931916714, + "timestamp": "2025-09-30 22:12:19.028111", + "step": 1734, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:19.082060", + "step": 1734, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013520815409719944, + "timestamp": "2025-09-30 22:12:19.086364", + "step": 1735, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:19.142929", + "step": 1735, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007343901786953211, + "timestamp": "2025-09-30 22:12:19.149616", + "step": 1736, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:19.207407", + "step": 1736, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012418312020599842, + "timestamp": "2025-09-30 22:12:19.209506", + "step": 1737, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:19.267575", + "step": 1737, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019974172115325928, + "timestamp": "2025-09-30 22:12:19.270043", + "step": 1738, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:19.329102", + "step": 1738, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005352269392460585, + "timestamp": "2025-09-30 22:12:19.333119", + "step": 1739, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:19.393917", + "step": 1739, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007415872532874346, + "timestamp": "2025-09-30 22:12:19.404366", + "step": 1740, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:19.465342", + "step": 1740, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020012961700558662, + "timestamp": "2025-09-30 22:12:19.467678", + "step": 1741, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:19.526733", + "step": 1741, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011243684217333794, + "timestamp": "2025-09-30 22:12:19.528909", + "step": 1742, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:19.599075", + "step": 1742, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023677440360188484, + "timestamp": "2025-09-30 22:12:19.601646", + "step": 1743, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:19.655231", + "step": 1743, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023243535310029984, + "timestamp": "2025-09-30 22:12:19.661376", + "step": 1744, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:19.730048", + "step": 1744, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009879850782454014, + "timestamp": "2025-09-30 22:12:19.732022", + "step": 1745, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:19.786102", + "step": 1745, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006921069230884314, + "timestamp": "2025-09-30 22:12:19.788498", + "step": 1746, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:19.853434", + "step": 1746, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004081842955201864, + "timestamp": "2025-09-30 22:12:19.855443", + "step": 1747, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:19.910691", + "step": 1747, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004515796434134245, + "timestamp": "2025-09-30 22:12:19.916269", + "step": 1748, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:19.973698", + "step": 1748, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005187832750380039, + "timestamp": "2025-09-30 22:12:19.975788", + "step": 1749, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:20.030734", + "step": 1749, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009365570731461048, + "timestamp": "2025-09-30 22:12:20.036962", + "step": 1750, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:20.099843", + "step": 1750, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014311355538666248, + "timestamp": "2025-09-30 22:12:20.101958", + "step": 1751, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:20.163735", + "step": 1751, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0038259513676166534, + "timestamp": "2025-09-30 22:12:20.169460", + "step": 1752, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:20.222660", + "step": 1752, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011391245760023594, + "timestamp": "2025-09-30 22:12:20.224850", + "step": 1753, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:20.282986", + "step": 1753, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03851420059800148, + "timestamp": "2025-09-30 22:12:20.285579", + "step": 1754, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:20.347897", + "step": 1754, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008675575256347656, + "timestamp": "2025-09-30 22:12:20.349959", + "step": 1755, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:20.405526", + "step": 1755, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016350556164979935, + "timestamp": "2025-09-30 22:12:20.411169", + "step": 1756, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:20.465248", + "step": 1756, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0020395752508193254, + "timestamp": "2025-09-30 22:12:20.468495", + "step": 1757, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:20.527627", + "step": 1757, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007115581072866917, + "timestamp": "2025-09-30 22:12:20.530931", + "step": 1758, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:20.588081", + "step": 1758, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008915326558053493, + "timestamp": "2025-09-30 22:12:20.590168", + "step": 1759, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:20.648323", + "step": 1759, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01385872345417738, + "timestamp": "2025-09-30 22:12:20.653777", + "step": 1760, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:20.710857", + "step": 1760, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019434064626693726, + "timestamp": "2025-09-30 22:12:20.712841", + "step": 1761, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:20.768129", + "step": 1761, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.001426653703674674, + "timestamp": "2025-09-30 22:12:20.770139", + "step": 1762, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:20.826674", + "step": 1762, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.000667016429360956, + "timestamp": "2025-09-30 22:12:20.828901", + "step": 1763, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:20.884903", + "step": 1763, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012331242673099041, + "timestamp": "2025-09-30 22:12:20.891604", + "step": 1764, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:20.946165", + "step": 1764, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008639157749712467, + "timestamp": "2025-09-30 22:12:20.948392", + "step": 1765, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:21.015661", + "step": 1765, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.001175249577499926, + "timestamp": "2025-09-30 22:12:21.017767", + "step": 1766, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:21.074161", + "step": 1766, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0008622457389719784, + "timestamp": "2025-09-30 22:12:21.076962", + "step": 1767, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:12:22.508631", + "step": 1767, + "epoch": 2 + }, + { + "type": "pplx", + "content": 30314827.130945917, + "timestamp": "2025-09-30 22:12:22.511215", + "step": 1767, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:22.574951", + "step": 1767, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.001509829773567617, + "timestamp": "2025-09-30 22:12:22.581148", + "step": 1768, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:22.653620", + "step": 1768, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011697669513523579, + "timestamp": "2025-09-30 22:12:22.655679", + "step": 1769, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:22.719592", + "step": 1769, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0038434225134551525, + "timestamp": "2025-09-30 22:12:22.721767", + "step": 1770, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:22.779999", + "step": 1770, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010291250422596931, + "timestamp": "2025-09-30 22:12:22.782196", + "step": 1771, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:22.855201", + "step": 1771, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00038557566585950553, + "timestamp": "2025-09-30 22:12:22.861700", + "step": 1772, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:22.925889", + "step": 1772, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002135960618034005, + "timestamp": "2025-09-30 22:12:22.928233", + "step": 1773, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:22.983098", + "step": 1773, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022244064137339592, + "timestamp": "2025-09-30 22:12:22.990669", + "step": 1774, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:23.047551", + "step": 1774, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021448533982038498, + "timestamp": "2025-09-30 22:12:23.050681", + "step": 1775, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:23.126710", + "step": 1775, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005585017614066601, + "timestamp": "2025-09-30 22:12:23.132632", + "step": 1776, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:23.209022", + "step": 1776, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0050450703129172325, + "timestamp": "2025-09-30 22:12:23.214519", + "step": 1777, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:23.281367", + "step": 1777, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006257961504161358, + "timestamp": "2025-09-30 22:12:23.284211", + "step": 1778, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:23.358645", + "step": 1778, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0036130433436483145, + "timestamp": "2025-09-30 22:12:23.360773", + "step": 1779, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:23.429914", + "step": 1779, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0057233721017837524, + "timestamp": "2025-09-30 22:12:23.435998", + "step": 1780, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:23.492874", + "step": 1780, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.001503689563833177, + "timestamp": "2025-09-30 22:12:23.495169", + "step": 1781, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:23.551669", + "step": 1781, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01165227685123682, + "timestamp": "2025-09-30 22:12:23.571755", + "step": 1782, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:23.661283", + "step": 1782, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0008390057482756674, + "timestamp": "2025-09-30 22:12:23.678023", + "step": 1783, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:23.740549", + "step": 1783, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0012339332606643438, + "timestamp": "2025-09-30 22:12:23.756039", + "step": 1784, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:23.820029", + "step": 1784, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0013973814202472568, + "timestamp": "2025-09-30 22:12:23.828115", + "step": 1785, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:23.902989", + "step": 1785, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0011909554013982415, + "timestamp": "2025-09-30 22:12:23.907395", + "step": 1786, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:23.984498", + "step": 1786, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020810788497328758, + "timestamp": "2025-09-30 22:12:23.989340", + "step": 1787, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:24.047666", + "step": 1787, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003498237580060959, + "timestamp": "2025-09-30 22:12:24.056749", + "step": 1788, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:24.128102", + "step": 1788, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0002930622431449592, + "timestamp": "2025-09-30 22:12:24.135350", + "step": 1789, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:24.208793", + "step": 1789, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0038824868388473988, + "timestamp": "2025-09-30 22:12:24.213275", + "step": 1790, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:24.272697", + "step": 1790, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0029304868075996637, + "timestamp": "2025-09-30 22:12:24.275392", + "step": 1791, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:24.361640", + "step": 1791, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008040046319365501, + "timestamp": "2025-09-30 22:12:24.373017", + "step": 1792, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:24.447033", + "step": 1792, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005316526163369417, + "timestamp": "2025-09-30 22:12:24.450440", + "step": 1793, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:24.525425", + "step": 1793, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0005331950960680842, + "timestamp": "2025-09-30 22:12:24.532931", + "step": 1794, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:24.598646", + "step": 1794, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0008259565802291036, + "timestamp": "2025-09-30 22:12:24.605028", + "step": 1795, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:24.665867", + "step": 1795, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005228062160313129, + "timestamp": "2025-09-30 22:12:24.674437", + "step": 1796, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:24.742664", + "step": 1796, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.025880755856633186, + "timestamp": "2025-09-30 22:12:24.748436", + "step": 1797, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:24.806826", + "step": 1797, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04264013096690178, + "timestamp": "2025-09-30 22:12:24.813400", + "step": 1798, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:24.875741", + "step": 1798, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03723777085542679, + "timestamp": "2025-09-30 22:12:24.881371", + "step": 1799, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:24.943188", + "step": 1799, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.028541741892695427, + "timestamp": "2025-09-30 22:12:24.950398", + "step": 1800, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:25.009304", + "step": 1800, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.000658934935927391, + "timestamp": "2025-09-30 22:12:25.014125", + "step": 1801, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:25.076617", + "step": 1801, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00514825526624918, + "timestamp": "2025-09-30 22:12:25.081465", + "step": 1802, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:25.140816", + "step": 1802, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01359549630433321, + "timestamp": "2025-09-30 22:12:25.144175", + "step": 1803, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:25.209586", + "step": 1803, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003553016809746623, + "timestamp": "2025-09-30 22:12:25.216248", + "step": 1804, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:25.276635", + "step": 1804, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01718035340309143, + "timestamp": "2025-09-30 22:12:25.280427", + "step": 1805, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:25.339761", + "step": 1805, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0026898744981735945, + "timestamp": "2025-09-30 22:12:25.343047", + "step": 1806, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:25.411331", + "step": 1806, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.029336009174585342, + "timestamp": "2025-09-30 22:12:25.414050", + "step": 1807, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:25.476066", + "step": 1807, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0065011694096028805, + "timestamp": "2025-09-30 22:12:25.485199", + "step": 1808, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:25.550418", + "step": 1808, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004351469222456217, + "timestamp": "2025-09-30 22:12:25.556302", + "step": 1809, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:25.623912", + "step": 1809, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009557033888995647, + "timestamp": "2025-09-30 22:12:25.626657", + "step": 1810, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:25.686205", + "step": 1810, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021093687042593956, + "timestamp": "2025-09-30 22:12:25.692721", + "step": 1811, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:25.751652", + "step": 1811, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007726198993623257, + "timestamp": "2025-09-30 22:12:25.761738", + "step": 1812, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:25.826331", + "step": 1812, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006598359905183315, + "timestamp": "2025-09-30 22:12:25.828662", + "step": 1813, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:25.892888", + "step": 1813, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007870987989008427, + "timestamp": "2025-09-30 22:12:25.896860", + "step": 1814, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:25.952583", + "step": 1814, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011960902251303196, + "timestamp": "2025-09-30 22:12:25.959745", + "step": 1815, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:26.032617", + "step": 1815, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002732042223215103, + "timestamp": "2025-09-30 22:12:26.040178", + "step": 1816, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:26.098524", + "step": 1816, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0035709214862436056, + "timestamp": "2025-09-30 22:12:26.102122", + "step": 1817, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:26.162259", + "step": 1817, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03033597581088543, + "timestamp": "2025-09-30 22:12:26.168318", + "step": 1818, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:26.234789", + "step": 1818, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04264497384428978, + "timestamp": "2025-09-30 22:12:26.242009", + "step": 1819, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:26.300287", + "step": 1819, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008137590251863003, + "timestamp": "2025-09-30 22:12:26.307528", + "step": 1820, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:26.374224", + "step": 1820, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004997264593839645, + "timestamp": "2025-09-30 22:12:26.384862", + "step": 1821, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:26.448877", + "step": 1821, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0041570719331502914, + "timestamp": "2025-09-30 22:12:26.453045", + "step": 1822, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:26.526430", + "step": 1822, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004612022079527378, + "timestamp": "2025-09-30 22:12:26.530133", + "step": 1823, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:26.591520", + "step": 1823, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024884754791855812, + "timestamp": "2025-09-30 22:12:26.598978", + "step": 1824, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:12:28.095332", + "step": 1824, + "epoch": 2 + }, + { + "type": "pplx", + "content": 30672189.36827617, + "timestamp": "2025-09-30 22:12:28.100771", + "step": 1824, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:28.170601", + "step": 1824, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00739399716258049, + "timestamp": "2025-09-30 22:12:28.173632", + "step": 1825, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:28.241236", + "step": 1825, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01240342017263174, + "timestamp": "2025-09-30 22:12:28.251710", + "step": 1826, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:28.317530", + "step": 1826, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0024557800497859716, + "timestamp": "2025-09-30 22:12:28.328228", + "step": 1827, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:28.386913", + "step": 1827, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013910613022744656, + "timestamp": "2025-09-30 22:12:28.394292", + "step": 1828, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:28.450865", + "step": 1828, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.001578363822773099, + "timestamp": "2025-09-30 22:12:28.453690", + "step": 1829, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:28.511246", + "step": 1829, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04201820492744446, + "timestamp": "2025-09-30 22:12:28.517272", + "step": 1830, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:28.576318", + "step": 1830, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0008280682959593832, + "timestamp": "2025-09-30 22:12:28.578521", + "step": 1831, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:28.649280", + "step": 1831, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006336410064250231, + "timestamp": "2025-09-30 22:12:28.661730", + "step": 1832, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:28.742433", + "step": 1832, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005034334491938353, + "timestamp": "2025-09-30 22:12:28.746912", + "step": 1833, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:28.836112", + "step": 1833, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013774161227047443, + "timestamp": "2025-09-30 22:12:28.839488", + "step": 1834, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:28.905166", + "step": 1834, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.06970737874507904, + "timestamp": "2025-09-30 22:12:28.913773", + "step": 1835, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:28.974737", + "step": 1835, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.053864989429712296, + "timestamp": "2025-09-30 22:12:28.983214", + "step": 1836, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:29.050549", + "step": 1836, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0217081718146801, + "timestamp": "2025-09-30 22:12:29.052793", + "step": 1837, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:29.121315", + "step": 1837, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013128918595612049, + "timestamp": "2025-09-30 22:12:29.123710", + "step": 1838, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:29.188599", + "step": 1838, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.057358644902706146, + "timestamp": "2025-09-30 22:12:29.191407", + "step": 1839, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:29.249536", + "step": 1839, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03622815012931824, + "timestamp": "2025-09-30 22:12:29.255911", + "step": 1840, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:29.323306", + "step": 1840, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.027150630950927734, + "timestamp": "2025-09-30 22:12:29.325939", + "step": 1841, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:29.391564", + "step": 1841, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002653220435604453, + "timestamp": "2025-09-30 22:12:29.398259", + "step": 1842, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:29.482420", + "step": 1842, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0013348475331440568, + "timestamp": "2025-09-30 22:12:29.485991", + "step": 1843, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:29.544185", + "step": 1843, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.029313331469893456, + "timestamp": "2025-09-30 22:12:29.549874", + "step": 1844, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:29.621496", + "step": 1844, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007896478287875652, + "timestamp": "2025-09-30 22:12:29.625555", + "step": 1845, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:29.689604", + "step": 1845, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0026143400464206934, + "timestamp": "2025-09-30 22:12:29.696347", + "step": 1846, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:29.758025", + "step": 1846, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01978653483092785, + "timestamp": "2025-09-30 22:12:29.761214", + "step": 1847, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:29.827132", + "step": 1847, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.028982680290937424, + "timestamp": "2025-09-30 22:12:29.833733", + "step": 1848, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:29.893636", + "step": 1848, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03256614878773689, + "timestamp": "2025-09-30 22:12:29.898393", + "step": 1849, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:29.963923", + "step": 1849, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010735915042459965, + "timestamp": "2025-09-30 22:12:29.967708", + "step": 1850, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:30.025959", + "step": 1850, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011695819906890392, + "timestamp": "2025-09-30 22:12:30.029295", + "step": 1851, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:30.084003", + "step": 1851, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006875795312225819, + "timestamp": "2025-09-30 22:12:30.093480", + "step": 1852, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:30.151820", + "step": 1852, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014828616753220558, + "timestamp": "2025-09-30 22:12:30.157422", + "step": 1853, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:30.218984", + "step": 1853, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010935215279459953, + "timestamp": "2025-09-30 22:12:30.222139", + "step": 1854, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:30.280694", + "step": 1854, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011751067824661732, + "timestamp": "2025-09-30 22:12:30.283823", + "step": 1855, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:30.342542", + "step": 1855, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03328597545623779, + "timestamp": "2025-09-30 22:12:30.349244", + "step": 1856, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:30.427820", + "step": 1856, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015032247640192509, + "timestamp": "2025-09-30 22:12:30.431664", + "step": 1857, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:30.487533", + "step": 1857, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014371916651725769, + "timestamp": "2025-09-30 22:12:30.495788", + "step": 1858, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:30.578734", + "step": 1858, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010754152201116085, + "timestamp": "2025-09-30 22:12:30.584875", + "step": 1859, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:30.647369", + "step": 1859, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01473289541900158, + "timestamp": "2025-09-30 22:12:30.654052", + "step": 1860, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:30.712046", + "step": 1860, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008067458868026733, + "timestamp": "2025-09-30 22:12:30.714880", + "step": 1861, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:30.774679", + "step": 1861, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02813909575343132, + "timestamp": "2025-09-30 22:12:30.779834", + "step": 1862, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:30.841083", + "step": 1862, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008334296755492687, + "timestamp": "2025-09-30 22:12:30.852524", + "step": 1863, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:30.913530", + "step": 1863, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015859607607126236, + "timestamp": "2025-09-30 22:12:30.920050", + "step": 1864, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:30.976590", + "step": 1864, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.017760148271918297, + "timestamp": "2025-09-30 22:12:30.982243", + "step": 1865, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:31.053340", + "step": 1865, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.024972213432192802, + "timestamp": "2025-09-30 22:12:31.056911", + "step": 1866, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:31.125418", + "step": 1866, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007451011333614588, + "timestamp": "2025-09-30 22:12:31.128354", + "step": 1867, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:31.185780", + "step": 1867, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.023097878322005272, + "timestamp": "2025-09-30 22:12:31.191973", + "step": 1868, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:31.247075", + "step": 1868, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005390969570726156, + "timestamp": "2025-09-30 22:12:31.250345", + "step": 1869, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:31.307936", + "step": 1869, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014087321236729622, + "timestamp": "2025-09-30 22:12:31.322415", + "step": 1870, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:31.385612", + "step": 1870, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018397578969597816, + "timestamp": "2025-09-30 22:12:31.390764", + "step": 1871, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:31.462449", + "step": 1871, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007025882601737976, + "timestamp": "2025-09-30 22:12:31.480345", + "step": 1872, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:31.536704", + "step": 1872, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011556877754628658, + "timestamp": "2025-09-30 22:12:31.541459", + "step": 1873, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:31.605457", + "step": 1873, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01839577779173851, + "timestamp": "2025-09-30 22:12:31.608900", + "step": 1874, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:31.680668", + "step": 1874, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010417903773486614, + "timestamp": "2025-09-30 22:12:31.690146", + "step": 1875, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:31.744977", + "step": 1875, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009947366081178188, + "timestamp": "2025-09-30 22:12:31.751963", + "step": 1876, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:31.820158", + "step": 1876, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012040347792208195, + "timestamp": "2025-09-30 22:12:31.822535", + "step": 1877, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:31.890594", + "step": 1877, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0012225349200889468, + "timestamp": "2025-09-30 22:12:31.894112", + "step": 1878, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:31.949728", + "step": 1878, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006086386274546385, + "timestamp": "2025-09-30 22:12:31.953732", + "step": 1879, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:32.009116", + "step": 1879, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00934376660734415, + "timestamp": "2025-09-30 22:12:32.016702", + "step": 1880, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:32.071527", + "step": 1880, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009230917319655418, + "timestamp": "2025-09-30 22:12:32.073939", + "step": 1881, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:12:33.529057", + "step": 1881, + "epoch": 3 + }, + { + "type": "pplx", + "content": 29811204.71677049, + "timestamp": "2025-09-30 22:12:33.532880", + "step": 1881, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:33.589718", + "step": 1881, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002237612148746848, + "timestamp": "2025-09-30 22:12:33.597449", + "step": 1882, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:33.654864", + "step": 1882, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0021926003973931074, + "timestamp": "2025-09-30 22:12:33.657646", + "step": 1883, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:33.719097", + "step": 1883, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007021576166152954, + "timestamp": "2025-09-30 22:12:33.725938", + "step": 1884, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:33.787887", + "step": 1884, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002189788268879056, + "timestamp": "2025-09-30 22:12:33.791754", + "step": 1885, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:33.849419", + "step": 1885, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0325036458671093, + "timestamp": "2025-09-30 22:12:33.852113", + "step": 1886, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:33.912628", + "step": 1886, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02236909046769142, + "timestamp": "2025-09-30 22:12:33.915640", + "step": 1887, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:33.973272", + "step": 1887, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018104365095496178, + "timestamp": "2025-09-30 22:12:33.979563", + "step": 1888, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:34.034551", + "step": 1888, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01875036023557186, + "timestamp": "2025-09-30 22:12:34.037045", + "step": 1889, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:34.095906", + "step": 1889, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010548067279160023, + "timestamp": "2025-09-30 22:12:34.098658", + "step": 1890, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:34.155832", + "step": 1890, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007385232951492071, + "timestamp": "2025-09-30 22:12:34.159492", + "step": 1891, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:34.214578", + "step": 1891, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00859599094837904, + "timestamp": "2025-09-30 22:12:34.226724", + "step": 1892, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:34.287077", + "step": 1892, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011528218165040016, + "timestamp": "2025-09-30 22:12:34.289393", + "step": 1893, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:34.344245", + "step": 1893, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03839213401079178, + "timestamp": "2025-09-30 22:12:34.347664", + "step": 1894, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:34.413440", + "step": 1894, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004163206089287996, + "timestamp": "2025-09-30 22:12:34.420353", + "step": 1895, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:34.476133", + "step": 1895, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006831273436546326, + "timestamp": "2025-09-30 22:12:34.482107", + "step": 1896, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:34.550831", + "step": 1896, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011001095408573747, + "timestamp": "2025-09-30 22:12:34.553869", + "step": 1897, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:34.613063", + "step": 1897, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002597188577055931, + "timestamp": "2025-09-30 22:12:34.619835", + "step": 1898, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:34.678929", + "step": 1898, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003021536162123084, + "timestamp": "2025-09-30 22:12:34.681965", + "step": 1899, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:34.736489", + "step": 1899, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.035559866577386856, + "timestamp": "2025-09-30 22:12:34.747555", + "step": 1900, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:34.808191", + "step": 1900, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00288590369746089, + "timestamp": "2025-09-30 22:12:34.812074", + "step": 1901, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:34.875165", + "step": 1901, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010649462230503559, + "timestamp": "2025-09-30 22:12:34.877318", + "step": 1902, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:34.938011", + "step": 1902, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.020891645923256874, + "timestamp": "2025-09-30 22:12:34.946875", + "step": 1903, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:35.007218", + "step": 1903, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.034685712307691574, + "timestamp": "2025-09-30 22:12:35.019492", + "step": 1904, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:35.075150", + "step": 1904, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0015056979609653354, + "timestamp": "2025-09-30 22:12:35.078579", + "step": 1905, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:35.133854", + "step": 1905, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.036593399941921234, + "timestamp": "2025-09-30 22:12:35.137590", + "step": 1906, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:35.193504", + "step": 1906, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0018256593029946089, + "timestamp": "2025-09-30 22:12:35.196477", + "step": 1907, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:35.255154", + "step": 1907, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04090496152639389, + "timestamp": "2025-09-30 22:12:35.261202", + "step": 1908, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:35.317711", + "step": 1908, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014386068098247051, + "timestamp": "2025-09-30 22:12:35.321688", + "step": 1909, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:35.386201", + "step": 1909, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018922999501228333, + "timestamp": "2025-09-30 22:12:35.390261", + "step": 1910, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:35.449885", + "step": 1910, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015430129133164883, + "timestamp": "2025-09-30 22:12:35.456042", + "step": 1911, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:35.516289", + "step": 1911, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014453909359872341, + "timestamp": "2025-09-30 22:12:35.522502", + "step": 1912, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:35.583529", + "step": 1912, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.019513025879859924, + "timestamp": "2025-09-30 22:12:35.586244", + "step": 1913, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:35.644204", + "step": 1913, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011197819374501705, + "timestamp": "2025-09-30 22:12:35.659235", + "step": 1914, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:35.718192", + "step": 1914, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016041845083236694, + "timestamp": "2025-09-30 22:12:35.720928", + "step": 1915, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:35.776597", + "step": 1915, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008869746699929237, + "timestamp": "2025-09-30 22:12:35.786518", + "step": 1916, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:35.843545", + "step": 1916, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009099365212023258, + "timestamp": "2025-09-30 22:12:35.847455", + "step": 1917, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:35.904360", + "step": 1917, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014621244743466377, + "timestamp": "2025-09-30 22:12:35.909624", + "step": 1918, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:35.965569", + "step": 1918, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009779931046068668, + "timestamp": "2025-09-30 22:12:35.981765", + "step": 1919, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:36.046589", + "step": 1919, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016505222767591476, + "timestamp": "2025-09-30 22:12:36.060991", + "step": 1920, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:36.125986", + "step": 1920, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006841294001787901, + "timestamp": "2025-09-30 22:12:36.129710", + "step": 1921, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:36.189066", + "step": 1921, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009847632609307766, + "timestamp": "2025-09-30 22:12:36.195855", + "step": 1922, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:36.252944", + "step": 1922, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02639087848365307, + "timestamp": "2025-09-30 22:12:36.258090", + "step": 1923, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:36.319740", + "step": 1923, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01638978160917759, + "timestamp": "2025-09-30 22:12:36.326755", + "step": 1924, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:36.386575", + "step": 1924, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.028748007491230965, + "timestamp": "2025-09-30 22:12:36.390353", + "step": 1925, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:36.444542", + "step": 1925, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018612733110785484, + "timestamp": "2025-09-30 22:12:36.447012", + "step": 1926, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:36.501961", + "step": 1926, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004065474960952997, + "timestamp": "2025-09-30 22:12:36.508877", + "step": 1927, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:36.565538", + "step": 1927, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006613335572183132, + "timestamp": "2025-09-30 22:12:36.571650", + "step": 1928, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:36.625346", + "step": 1928, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.024656997993588448, + "timestamp": "2025-09-30 22:12:36.628060", + "step": 1929, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:36.686966", + "step": 1929, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04802418872714043, + "timestamp": "2025-09-30 22:12:36.689252", + "step": 1930, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:36.742692", + "step": 1930, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.021498506888747215, + "timestamp": "2025-09-30 22:12:36.745190", + "step": 1931, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:36.799740", + "step": 1931, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008578136563301086, + "timestamp": "2025-09-30 22:12:36.805549", + "step": 1932, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:36.861069", + "step": 1932, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0029331946279853582, + "timestamp": "2025-09-30 22:12:36.863376", + "step": 1933, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:36.925739", + "step": 1933, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01277841441333294, + "timestamp": "2025-09-30 22:12:36.928011", + "step": 1934, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:36.989085", + "step": 1934, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008462684229016304, + "timestamp": "2025-09-30 22:12:36.992617", + "step": 1935, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:37.058882", + "step": 1935, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00633528595790267, + "timestamp": "2025-09-30 22:12:37.064520", + "step": 1936, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:37.126025", + "step": 1936, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009705213829874992, + "timestamp": "2025-09-30 22:12:37.128017", + "step": 1937, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:37.183503", + "step": 1937, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009753524325788021, + "timestamp": "2025-09-30 22:12:37.187696", + "step": 1938, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:12:38.512858", + "step": 1938, + "epoch": 3 + }, + { + "type": "pplx", + "content": 29558369.15336448, + "timestamp": "2025-09-30 22:12:38.514909", + "step": 1938, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:38.567034", + "step": 1938, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.025362467393279076, + "timestamp": "2025-09-30 22:12:38.571280", + "step": 1939, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:38.646596", + "step": 1939, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004295314662158489, + "timestamp": "2025-09-30 22:12:38.654788", + "step": 1940, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:38.717141", + "step": 1940, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0038865043316036463, + "timestamp": "2025-09-30 22:12:38.723450", + "step": 1941, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:38.785516", + "step": 1941, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016797490417957306, + "timestamp": "2025-09-30 22:12:38.790002", + "step": 1942, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:38.845954", + "step": 1942, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.031100988388061523, + "timestamp": "2025-09-30 22:12:38.850849", + "step": 1943, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:12:38.924658", + "step": 1943, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0393851213157177, + "timestamp": "2025-09-30 22:12:38.935303", + "step": 1944, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:38.996063", + "step": 1944, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005985604133456945, + "timestamp": "2025-09-30 22:12:39.001238", + "step": 1945, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:39.058690", + "step": 1945, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008343399502336979, + "timestamp": "2025-09-30 22:12:39.064100", + "step": 1946, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:39.120940", + "step": 1946, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003788830479606986, + "timestamp": "2025-09-30 22:12:39.126550", + "step": 1947, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:39.185776", + "step": 1947, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005876143928617239, + "timestamp": "2025-09-30 22:12:39.194044", + "step": 1948, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:39.247405", + "step": 1948, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009399537928402424, + "timestamp": "2025-09-30 22:12:39.252551", + "step": 1949, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:39.317976", + "step": 1949, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0053526838310062885, + "timestamp": "2025-09-30 22:12:39.321738", + "step": 1950, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:39.384587", + "step": 1950, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.021824544295668602, + "timestamp": "2025-09-30 22:12:39.389592", + "step": 1951, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:39.448210", + "step": 1951, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016011234372854233, + "timestamp": "2025-09-30 22:12:39.457427", + "step": 1952, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:39.515827", + "step": 1952, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0025454445276409388, + "timestamp": "2025-09-30 22:12:39.522667", + "step": 1953, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:39.580991", + "step": 1953, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003802164224907756, + "timestamp": "2025-09-30 22:12:39.584553", + "step": 1954, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:39.644276", + "step": 1954, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03186912089586258, + "timestamp": "2025-09-30 22:12:39.646374", + "step": 1955, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:39.704066", + "step": 1955, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.023978808894753456, + "timestamp": "2025-09-30 22:12:39.710510", + "step": 1956, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:39.767903", + "step": 1956, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01542146410793066, + "timestamp": "2025-09-30 22:12:39.770627", + "step": 1957, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:39.824884", + "step": 1957, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009438499808311462, + "timestamp": "2025-09-30 22:12:39.830557", + "step": 1958, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:39.900535", + "step": 1958, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00540934456512332, + "timestamp": "2025-09-30 22:12:39.903028", + "step": 1959, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:39.958455", + "step": 1959, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.034282151609659195, + "timestamp": "2025-09-30 22:12:39.964357", + "step": 1960, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:40.020785", + "step": 1960, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006182772573083639, + "timestamp": "2025-09-30 22:12:40.022932", + "step": 1961, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:40.077532", + "step": 1961, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004338286817073822, + "timestamp": "2025-09-30 22:12:40.079805", + "step": 1962, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:40.134069", + "step": 1962, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012008560821413994, + "timestamp": "2025-09-30 22:12:40.136462", + "step": 1963, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:40.217923", + "step": 1963, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03428441658616066, + "timestamp": "2025-09-30 22:12:40.223659", + "step": 1964, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:40.278527", + "step": 1964, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008386987261474133, + "timestamp": "2025-09-30 22:12:40.280482", + "step": 1965, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:40.340067", + "step": 1965, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0035431894939392805, + "timestamp": "2025-09-30 22:12:40.352999", + "step": 1966, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:40.426650", + "step": 1966, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005195711273699999, + "timestamp": "2025-09-30 22:12:40.428878", + "step": 1967, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:40.483244", + "step": 1967, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014768867753446102, + "timestamp": "2025-09-30 22:12:40.489131", + "step": 1968, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:40.541727", + "step": 1968, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0026658920105546713, + "timestamp": "2025-09-30 22:12:40.544813", + "step": 1969, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:40.605167", + "step": 1969, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00781507883220911, + "timestamp": "2025-09-30 22:12:40.607386", + "step": 1970, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:40.671135", + "step": 1970, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012213180772960186, + "timestamp": "2025-09-30 22:12:40.672951", + "step": 1971, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:40.731179", + "step": 1971, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.030378760769963264, + "timestamp": "2025-09-30 22:12:40.739563", + "step": 1972, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:40.796297", + "step": 1972, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014325362630188465, + "timestamp": "2025-09-30 22:12:40.800560", + "step": 1973, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:40.865717", + "step": 1973, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006878285203129053, + "timestamp": "2025-09-30 22:12:40.871537", + "step": 1974, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:40.943601", + "step": 1974, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.019338076934218407, + "timestamp": "2025-09-30 22:12:40.960489", + "step": 1975, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:41.020679", + "step": 1975, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013357887975871563, + "timestamp": "2025-09-30 22:12:41.029453", + "step": 1976, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:41.107755", + "step": 1976, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00660253269597888, + "timestamp": "2025-09-30 22:12:41.126488", + "step": 1977, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:41.203228", + "step": 1977, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006124368868768215, + "timestamp": "2025-09-30 22:12:41.221345", + "step": 1978, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:41.292381", + "step": 1978, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00962238758802414, + "timestamp": "2025-09-30 22:12:41.299224", + "step": 1979, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:41.357369", + "step": 1979, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002619996899738908, + "timestamp": "2025-09-30 22:12:41.379860", + "step": 1980, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:41.448904", + "step": 1980, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008656610734760761, + "timestamp": "2025-09-30 22:12:41.452895", + "step": 1981, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:41.515103", + "step": 1981, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.028397956863045692, + "timestamp": "2025-09-30 22:12:41.519895", + "step": 1982, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:41.586905", + "step": 1982, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02610507234930992, + "timestamp": "2025-09-30 22:12:41.590858", + "step": 1983, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:41.654054", + "step": 1983, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018599865958094597, + "timestamp": "2025-09-30 22:12:41.671758", + "step": 1984, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:41.737134", + "step": 1984, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005543197970837355, + "timestamp": "2025-09-30 22:12:41.742183", + "step": 1985, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:41.813581", + "step": 1985, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002317222999408841, + "timestamp": "2025-09-30 22:12:41.815633", + "step": 1986, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:41.881426", + "step": 1986, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014996451325714588, + "timestamp": "2025-09-30 22:12:41.883416", + "step": 1987, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:41.937781", + "step": 1987, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.017824586480855942, + "timestamp": "2025-09-30 22:12:41.943617", + "step": 1988, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:42.001541", + "step": 1988, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.026619719341397285, + "timestamp": "2025-09-30 22:12:42.004640", + "step": 1989, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:42.068935", + "step": 1989, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0035347731318324804, + "timestamp": "2025-09-30 22:12:42.071286", + "step": 1990, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:42.126117", + "step": 1990, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03799136355519295, + "timestamp": "2025-09-30 22:12:42.128517", + "step": 1991, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:42.184013", + "step": 1991, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04470697417855263, + "timestamp": "2025-09-30 22:12:42.189722", + "step": 1992, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:42.244769", + "step": 1992, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.05278744548559189, + "timestamp": "2025-09-30 22:12:42.249219", + "step": 1993, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:42.315365", + "step": 1993, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01737930439412594, + "timestamp": "2025-09-30 22:12:42.318475", + "step": 1994, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:42.383369", + "step": 1994, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01412777416408062, + "timestamp": "2025-09-30 22:12:42.385937", + "step": 1995, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:12:43.827522", + "step": 1995, + "epoch": 3 + }, + { + "type": "pplx", + "content": 30986920.64344966, + "timestamp": "2025-09-30 22:12:43.836901", + "step": 1995, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:43.896684", + "step": 1995, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007152962498366833, + "timestamp": "2025-09-30 22:12:43.904023", + "step": 1996, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:43.962205", + "step": 1996, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008768231607973576, + "timestamp": "2025-09-30 22:12:43.965930", + "step": 1997, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:44.024945", + "step": 1997, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014288007281720638, + "timestamp": "2025-09-30 22:12:44.028859", + "step": 1998, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:44.086462", + "step": 1998, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.025513725355267525, + "timestamp": "2025-09-30 22:12:44.090556", + "step": 1999, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:44.145220", + "step": 1999, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.022684959694743156, + "timestamp": "2025-09-30 22:12:44.152892", + "step": 2000, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2000", + "timestamp": "2025-09-30 22:12:44.704758", + "step": 2000, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:44.762348", + "step": 2000, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01981218159198761, + "timestamp": "2025-09-30 22:12:44.765533", + "step": 2001, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:44.837870", + "step": 2001, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006580715533345938, + "timestamp": "2025-09-30 22:12:44.840899", + "step": 2002, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:44.910869", + "step": 2002, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007349673192948103, + "timestamp": "2025-09-30 22:12:44.924598", + "step": 2003, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:44.990101", + "step": 2003, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03402791544795036, + "timestamp": "2025-09-30 22:12:44.996862", + "step": 2004, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:45.064039", + "step": 2004, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0037932060658931732, + "timestamp": "2025-09-30 22:12:45.073329", + "step": 2005, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:45.132903", + "step": 2005, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03237783536314964, + "timestamp": "2025-09-30 22:12:45.135893", + "step": 2006, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:45.204595", + "step": 2006, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012929347343742847, + "timestamp": "2025-09-30 22:12:45.207303", + "step": 2007, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:45.265577", + "step": 2007, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01525042299181223, + "timestamp": "2025-09-30 22:12:45.277214", + "step": 2008, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:45.338311", + "step": 2008, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018445158377289772, + "timestamp": "2025-09-30 22:12:45.342291", + "step": 2009, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:45.406350", + "step": 2009, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.023437367752194405, + "timestamp": "2025-09-30 22:12:45.416485", + "step": 2010, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:45.478091", + "step": 2010, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.022062715142965317, + "timestamp": "2025-09-30 22:12:45.487759", + "step": 2011, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:45.555782", + "step": 2011, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.022687969729304314, + "timestamp": "2025-09-30 22:12:45.562458", + "step": 2012, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:45.629028", + "step": 2012, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007122944109141827, + "timestamp": "2025-09-30 22:12:45.631721", + "step": 2013, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:45.688904", + "step": 2013, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007444100920110941, + "timestamp": "2025-09-30 22:12:45.692505", + "step": 2014, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:45.747857", + "step": 2014, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02668391726911068, + "timestamp": "2025-09-30 22:12:45.757716", + "step": 2015, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:45.824072", + "step": 2015, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01846177875995636, + "timestamp": "2025-09-30 22:12:45.830050", + "step": 2016, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:45.888760", + "step": 2016, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013639253564178944, + "timestamp": "2025-09-30 22:12:45.898113", + "step": 2017, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:45.960139", + "step": 2017, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01387722697108984, + "timestamp": "2025-09-30 22:12:45.968363", + "step": 2018, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:46.026652", + "step": 2018, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00344693916849792, + "timestamp": "2025-09-30 22:12:46.036303", + "step": 2019, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:46.092962", + "step": 2019, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010282598435878754, + "timestamp": "2025-09-30 22:12:46.101409", + "step": 2020, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:46.156391", + "step": 2020, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006298901047557592, + "timestamp": "2025-09-30 22:12:46.159534", + "step": 2021, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:46.213661", + "step": 2021, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008653373457491398, + "timestamp": "2025-09-30 22:12:46.217113", + "step": 2022, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:46.271623", + "step": 2022, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0037791277281939983, + "timestamp": "2025-09-30 22:12:46.276546", + "step": 2023, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:46.338579", + "step": 2023, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014515328221023083, + "timestamp": "2025-09-30 22:12:46.351822", + "step": 2024, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:46.405559", + "step": 2024, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013266796246170998, + "timestamp": "2025-09-30 22:12:46.416235", + "step": 2025, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:46.474075", + "step": 2025, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002332469215616584, + "timestamp": "2025-09-30 22:12:46.483229", + "step": 2026, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:46.547920", + "step": 2026, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00418002950027585, + "timestamp": "2025-09-30 22:12:46.559265", + "step": 2027, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:46.614126", + "step": 2027, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004884445574134588, + "timestamp": "2025-09-30 22:12:46.621285", + "step": 2028, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:46.683577", + "step": 2028, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01921374723315239, + "timestamp": "2025-09-30 22:12:46.687247", + "step": 2029, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:46.754441", + "step": 2029, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01416950486600399, + "timestamp": "2025-09-30 22:12:46.764603", + "step": 2030, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:46.826453", + "step": 2030, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004991916473954916, + "timestamp": "2025-09-30 22:12:46.829235", + "step": 2031, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:46.891979", + "step": 2031, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00923225563019514, + "timestamp": "2025-09-30 22:12:46.899939", + "step": 2032, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:46.960726", + "step": 2032, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.029489969834685326, + "timestamp": "2025-09-30 22:12:46.963458", + "step": 2033, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:47.025987", + "step": 2033, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03119957633316517, + "timestamp": "2025-09-30 22:12:47.036326", + "step": 2034, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:47.092090", + "step": 2034, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013563781045377254, + "timestamp": "2025-09-30 22:12:47.103675", + "step": 2035, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:47.172647", + "step": 2035, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013546071946620941, + "timestamp": "2025-09-30 22:12:47.179641", + "step": 2036, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:47.235262", + "step": 2036, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013057815842330456, + "timestamp": "2025-09-30 22:12:47.237721", + "step": 2037, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:47.304263", + "step": 2037, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.028074579313397408, + "timestamp": "2025-09-30 22:12:47.307359", + "step": 2038, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:47.365018", + "step": 2038, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0029875219333916903, + "timestamp": "2025-09-30 22:12:47.376522", + "step": 2039, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:47.440639", + "step": 2039, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002364709507673979, + "timestamp": "2025-09-30 22:12:47.453613", + "step": 2040, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:47.519200", + "step": 2040, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01649676077067852, + "timestamp": "2025-09-30 22:12:47.528985", + "step": 2041, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:47.591132", + "step": 2041, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0018646756652742624, + "timestamp": "2025-09-30 22:12:47.594259", + "step": 2042, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:47.657779", + "step": 2042, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018227288499474525, + "timestamp": "2025-09-30 22:12:47.662542", + "step": 2043, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:47.725112", + "step": 2043, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02195359393954277, + "timestamp": "2025-09-30 22:12:47.739489", + "step": 2044, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:47.807304", + "step": 2044, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00557445315644145, + "timestamp": "2025-09-30 22:12:47.811883", + "step": 2045, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:47.868755", + "step": 2045, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04431688040494919, + "timestamp": "2025-09-30 22:12:47.873901", + "step": 2046, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:47.937423", + "step": 2046, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018206002190709114, + "timestamp": "2025-09-30 22:12:47.947104", + "step": 2047, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:48.006345", + "step": 2047, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006048198323696852, + "timestamp": "2025-09-30 22:12:48.024417", + "step": 2048, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:48.092028", + "step": 2048, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004360709339380264, + "timestamp": "2025-09-30 22:12:48.107431", + "step": 2049, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:48.174773", + "step": 2049, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011058392934501171, + "timestamp": "2025-09-30 22:12:48.187116", + "step": 2050, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:48.242660", + "step": 2050, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00571943586692214, + "timestamp": "2025-09-30 22:12:48.249247", + "step": 2051, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:48.310667", + "step": 2051, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.026921484619379044, + "timestamp": "2025-09-30 22:12:48.325873", + "step": 2052, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:12:49.721444", + "step": 2052, + "epoch": 3 + }, + { + "type": "pplx", + "content": 31591669.075829722, + "timestamp": "2025-09-30 22:12:49.725309", + "step": 2052, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:49.784999", + "step": 2052, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007747288327664137, + "timestamp": "2025-09-30 22:12:49.797589", + "step": 2053, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:49.852580", + "step": 2053, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0032807444222271442, + "timestamp": "2025-09-30 22:12:49.855656", + "step": 2054, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:49.917143", + "step": 2054, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013838792219758034, + "timestamp": "2025-09-30 22:12:49.931141", + "step": 2055, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:49.987250", + "step": 2055, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002472275635227561, + "timestamp": "2025-09-30 22:12:49.995095", + "step": 2056, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:50.050210", + "step": 2056, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01641083136200905, + "timestamp": "2025-09-30 22:12:50.061765", + "step": 2057, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:50.122990", + "step": 2057, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00696180434897542, + "timestamp": "2025-09-30 22:12:50.126640", + "step": 2058, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:50.183835", + "step": 2058, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006303395610302687, + "timestamp": "2025-09-30 22:12:50.196071", + "step": 2059, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:50.252924", + "step": 2059, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011265805922448635, + "timestamp": "2025-09-30 22:12:50.267874", + "step": 2060, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:50.326364", + "step": 2060, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01980595663189888, + "timestamp": "2025-09-30 22:12:50.341639", + "step": 2061, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:50.400756", + "step": 2061, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002819489687681198, + "timestamp": "2025-09-30 22:12:50.406844", + "step": 2062, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:50.463800", + "step": 2062, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007079659961163998, + "timestamp": "2025-09-30 22:12:50.467330", + "step": 2063, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:50.524689", + "step": 2063, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009019630961120129, + "timestamp": "2025-09-30 22:12:50.533155", + "step": 2064, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:50.603580", + "step": 2064, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0046064346097409725, + "timestamp": "2025-09-30 22:12:50.606342", + "step": 2065, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:50.662393", + "step": 2065, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.025710809975862503, + "timestamp": "2025-09-30 22:12:50.675290", + "step": 2066, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:50.739641", + "step": 2066, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01167394407093525, + "timestamp": "2025-09-30 22:12:50.742662", + "step": 2067, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:50.805429", + "step": 2067, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.021902693435549736, + "timestamp": "2025-09-30 22:12:50.812165", + "step": 2068, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:50.871441", + "step": 2068, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004060753621160984, + "timestamp": "2025-09-30 22:12:50.874617", + "step": 2069, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:50.929615", + "step": 2069, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.023838216438889503, + "timestamp": "2025-09-30 22:12:50.933346", + "step": 2070, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:50.988296", + "step": 2070, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.000567333830986172, + "timestamp": "2025-09-30 22:12:50.991262", + "step": 2071, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:51.048304", + "step": 2071, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.033882711082696915, + "timestamp": "2025-09-30 22:12:51.054912", + "step": 2072, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:51.125544", + "step": 2072, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01983877643942833, + "timestamp": "2025-09-30 22:12:51.128731", + "step": 2073, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:51.183987", + "step": 2073, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012886099517345428, + "timestamp": "2025-09-30 22:12:51.192788", + "step": 2074, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:51.255083", + "step": 2074, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005958546884357929, + "timestamp": "2025-09-30 22:12:51.257289", + "step": 2075, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:51.325526", + "step": 2075, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005503936670720577, + "timestamp": "2025-09-30 22:12:51.333228", + "step": 2076, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:51.387290", + "step": 2076, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0063980803824961185, + "timestamp": "2025-09-30 22:12:51.396594", + "step": 2077, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:51.456724", + "step": 2077, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012718225829303265, + "timestamp": "2025-09-30 22:12:51.459467", + "step": 2078, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:51.520522", + "step": 2078, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02810058370232582, + "timestamp": "2025-09-30 22:12:51.523666", + "step": 2079, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:51.578431", + "step": 2079, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00501110078766942, + "timestamp": "2025-09-30 22:12:51.585017", + "step": 2080, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:51.640545", + "step": 2080, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015446553006768227, + "timestamp": "2025-09-30 22:12:51.649823", + "step": 2081, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:51.706135", + "step": 2081, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006646531168371439, + "timestamp": "2025-09-30 22:12:51.709919", + "step": 2082, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:51.770248", + "step": 2082, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007312596309930086, + "timestamp": "2025-09-30 22:12:51.773296", + "step": 2083, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:51.834524", + "step": 2083, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001499996636994183, + "timestamp": "2025-09-30 22:12:51.847405", + "step": 2084, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:51.901463", + "step": 2084, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03770451620221138, + "timestamp": "2025-09-30 22:12:51.904924", + "step": 2085, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:51.961949", + "step": 2085, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013079524040222168, + "timestamp": "2025-09-30 22:12:51.965128", + "step": 2086, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:52.023658", + "step": 2086, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007665101904422045, + "timestamp": "2025-09-30 22:12:52.026475", + "step": 2087, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:52.099632", + "step": 2087, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014709694311022758, + "timestamp": "2025-09-30 22:12:52.114550", + "step": 2088, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:52.174855", + "step": 2088, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0035318650770932436, + "timestamp": "2025-09-30 22:12:52.178697", + "step": 2089, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:12:52.242420", + "step": 2089, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0026837512850761414, + "timestamp": "2025-09-30 22:12:52.245902", + "step": 2090, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:52.303214", + "step": 2090, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003676437307149172, + "timestamp": "2025-09-30 22:12:52.307125", + "step": 2091, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:52.366135", + "step": 2091, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.022839205339550972, + "timestamp": "2025-09-30 22:12:52.375466", + "step": 2092, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:52.432074", + "step": 2092, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0020653358660638332, + "timestamp": "2025-09-30 22:12:52.435305", + "step": 2093, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:52.496909", + "step": 2093, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010991484858095646, + "timestamp": "2025-09-30 22:12:52.500189", + "step": 2094, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:52.560748", + "step": 2094, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005481290630996227, + "timestamp": "2025-09-30 22:12:52.565143", + "step": 2095, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:52.620407", + "step": 2095, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04029859974980354, + "timestamp": "2025-09-30 22:12:52.627397", + "step": 2096, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:52.684592", + "step": 2096, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0042072138749063015, + "timestamp": "2025-09-30 22:12:52.689409", + "step": 2097, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:52.747358", + "step": 2097, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.049394670873880386, + "timestamp": "2025-09-30 22:12:52.751278", + "step": 2098, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:52.806129", + "step": 2098, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.020421581342816353, + "timestamp": "2025-09-30 22:12:52.809621", + "step": 2099, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:52.865837", + "step": 2099, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010594218969345093, + "timestamp": "2025-09-30 22:12:52.872798", + "step": 2100, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:52.926620", + "step": 2100, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0015280555235221982, + "timestamp": "2025-09-30 22:12:52.935170", + "step": 2101, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:52.990431", + "step": 2101, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0070367841981351376, + "timestamp": "2025-09-30 22:12:52.993310", + "step": 2102, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:53.050292", + "step": 2102, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.030247675254940987, + "timestamp": "2025-09-30 22:12:53.053333", + "step": 2103, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:53.116406", + "step": 2103, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0010143903782591224, + "timestamp": "2025-09-30 22:12:53.123065", + "step": 2104, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:53.176416", + "step": 2104, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0039184377528727055, + "timestamp": "2025-09-30 22:12:53.179306", + "step": 2105, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:53.235365", + "step": 2105, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007116036955267191, + "timestamp": "2025-09-30 22:12:53.238300", + "step": 2106, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:53.293136", + "step": 2106, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02538345754146576, + "timestamp": "2025-09-30 22:12:53.296719", + "step": 2107, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:53.358073", + "step": 2107, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007159593049436808, + "timestamp": "2025-09-30 22:12:53.377858", + "step": 2108, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:53.433335", + "step": 2108, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012715340591967106, + "timestamp": "2025-09-30 22:12:53.435844", + "step": 2109, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:12:54.807881", + "step": 2109, + "epoch": 3 + }, + { + "type": "pplx", + "content": 31717602.29605612, + "timestamp": "2025-09-30 22:12:54.810448", + "step": 2109, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:54.866666", + "step": 2109, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0069103785790503025, + "timestamp": "2025-09-30 22:12:54.870626", + "step": 2110, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:54.925774", + "step": 2110, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0004411868576426059, + "timestamp": "2025-09-30 22:12:54.928538", + "step": 2111, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:54.986256", + "step": 2111, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02559594437479973, + "timestamp": "2025-09-30 22:12:54.994525", + "step": 2112, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:55.056307", + "step": 2112, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006536331493407488, + "timestamp": "2025-09-30 22:12:55.061762", + "step": 2113, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:55.119574", + "step": 2113, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.034199222922325134, + "timestamp": "2025-09-30 22:12:55.121736", + "step": 2114, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:55.184055", + "step": 2114, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0037060887552797794, + "timestamp": "2025-09-30 22:12:55.190340", + "step": 2115, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:55.246418", + "step": 2115, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010455816984176636, + "timestamp": "2025-09-30 22:12:55.257495", + "step": 2116, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:55.323363", + "step": 2116, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009131490252912045, + "timestamp": "2025-09-30 22:12:55.329740", + "step": 2117, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:55.385535", + "step": 2117, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.028637733310461044, + "timestamp": "2025-09-30 22:12:55.388119", + "step": 2118, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:55.462086", + "step": 2118, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0017552494537085295, + "timestamp": "2025-09-30 22:12:55.464428", + "step": 2119, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:55.524997", + "step": 2119, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0008230188977904618, + "timestamp": "2025-09-30 22:12:55.532668", + "step": 2120, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:12:55.586377", + "step": 2120, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001255502225831151, + "timestamp": "2025-09-30 22:12:55.599866", + "step": 2121, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:55.687367", + "step": 2121, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03605801239609718, + "timestamp": "2025-09-30 22:12:55.689607", + "step": 2122, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:55.780220", + "step": 2122, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02137705311179161, + "timestamp": "2025-09-30 22:12:55.784688", + "step": 2123, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:55.876184", + "step": 2123, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00452017318457365, + "timestamp": "2025-09-30 22:12:55.885171", + "step": 2124, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:55.977205", + "step": 2124, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.022074688225984573, + "timestamp": "2025-09-30 22:12:55.979429", + "step": 2125, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:56.069986", + "step": 2125, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01156954187899828, + "timestamp": "2025-09-30 22:12:56.072411", + "step": 2126, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:56.160302", + "step": 2126, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01876743696630001, + "timestamp": "2025-09-30 22:12:56.162722", + "step": 2127, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:56.258051", + "step": 2127, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018221214413642883, + "timestamp": "2025-09-30 22:12:56.268989", + "step": 2128, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:56.343166", + "step": 2128, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011680861935019493, + "timestamp": "2025-09-30 22:12:56.345766", + "step": 2129, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:56.427254", + "step": 2129, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007095720618963242, + "timestamp": "2025-09-30 22:12:56.431297", + "step": 2130, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:56.498838", + "step": 2130, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004647306632250547, + "timestamp": "2025-09-30 22:12:56.501718", + "step": 2131, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:56.576126", + "step": 2131, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012296868488192558, + "timestamp": "2025-09-30 22:12:56.582860", + "step": 2132, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:56.659773", + "step": 2132, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03337612748146057, + "timestamp": "2025-09-30 22:12:56.663029", + "step": 2133, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:56.744957", + "step": 2133, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00937966164201498, + "timestamp": "2025-09-30 22:12:56.748135", + "step": 2134, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:56.821695", + "step": 2134, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002799929352477193, + "timestamp": "2025-09-30 22:12:56.825877", + "step": 2135, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:56.906853", + "step": 2135, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0039896611124277115, + "timestamp": "2025-09-30 22:12:56.913216", + "step": 2136, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:56.988658", + "step": 2136, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00480355229228735, + "timestamp": "2025-09-30 22:12:56.998934", + "step": 2137, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:57.075916", + "step": 2137, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006306238938122988, + "timestamp": "2025-09-30 22:12:57.078555", + "step": 2138, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:57.158792", + "step": 2138, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0029865510296076536, + "timestamp": "2025-09-30 22:12:57.166147", + "step": 2139, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:57.241108", + "step": 2139, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00501589709892869, + "timestamp": "2025-09-30 22:12:57.252081", + "step": 2140, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:57.309098", + "step": 2140, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0062422240152955055, + "timestamp": "2025-09-30 22:12:57.311877", + "step": 2141, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:57.370622", + "step": 2141, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014008880592882633, + "timestamp": "2025-09-30 22:12:57.373707", + "step": 2142, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:57.446870", + "step": 2142, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0016020223265513778, + "timestamp": "2025-09-30 22:12:57.450732", + "step": 2143, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:57.506223", + "step": 2143, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007073293440043926, + "timestamp": "2025-09-30 22:12:57.520776", + "step": 2144, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:57.581482", + "step": 2144, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006945033557713032, + "timestamp": "2025-09-30 22:12:57.585496", + "step": 2145, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:57.640324", + "step": 2145, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0017107333987951279, + "timestamp": "2025-09-30 22:12:57.642964", + "step": 2146, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:57.697675", + "step": 2146, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.020304743200540543, + "timestamp": "2025-09-30 22:12:57.701181", + "step": 2147, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:57.755963", + "step": 2147, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004924300592392683, + "timestamp": "2025-09-30 22:12:57.762052", + "step": 2148, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:57.825982", + "step": 2148, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008847953751683235, + "timestamp": "2025-09-30 22:12:57.828336", + "step": 2149, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:57.895714", + "step": 2149, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0031908315140753984, + "timestamp": "2025-09-30 22:12:57.898745", + "step": 2150, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:57.954645", + "step": 2150, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009887597523629665, + "timestamp": "2025-09-30 22:12:57.957892", + "step": 2151, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:58.016821", + "step": 2151, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016261015087366104, + "timestamp": "2025-09-30 22:12:58.023858", + "step": 2152, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:58.082714", + "step": 2152, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011811012402176857, + "timestamp": "2025-09-30 22:12:58.086341", + "step": 2153, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:58.143616", + "step": 2153, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006635370198637247, + "timestamp": "2025-09-30 22:12:58.146935", + "step": 2154, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:58.203868", + "step": 2154, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009925504215061665, + "timestamp": "2025-09-30 22:12:58.206731", + "step": 2155, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:58.276892", + "step": 2155, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01277694571763277, + "timestamp": "2025-09-30 22:12:58.284433", + "step": 2156, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:58.341811", + "step": 2156, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.019755104556679726, + "timestamp": "2025-09-30 22:12:58.345051", + "step": 2157, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:58.401533", + "step": 2157, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005314360372722149, + "timestamp": "2025-09-30 22:12:58.404741", + "step": 2158, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:58.462362", + "step": 2158, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008464450016617775, + "timestamp": "2025-09-30 22:12:58.465423", + "step": 2159, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:58.536232", + "step": 2159, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02357994019985199, + "timestamp": "2025-09-30 22:12:58.543507", + "step": 2160, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:58.602367", + "step": 2160, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015320166014134884, + "timestamp": "2025-09-30 22:12:58.609775", + "step": 2161, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:12:58.664326", + "step": 2161, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.026179542765021324, + "timestamp": "2025-09-30 22:12:58.667705", + "step": 2162, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:12:58.728789", + "step": 2162, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.021009227260947227, + "timestamp": "2025-09-30 22:12:58.732799", + "step": 2163, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:58.795597", + "step": 2163, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.019618257880210876, + "timestamp": "2025-09-30 22:12:58.801753", + "step": 2164, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:12:58.859468", + "step": 2164, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0046376558020710945, + "timestamp": "2025-09-30 22:12:58.863573", + "step": 2165, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:12:58.919648", + "step": 2165, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013165593147277832, + "timestamp": "2025-09-30 22:12:58.934748", + "step": 2166, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:13:00.344251", + "step": 2166, + "epoch": 3 + }, + { + "type": "pplx", + "content": 28392377.416443832, + "timestamp": "2025-09-30 22:13:00.348523", + "step": 2166, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:00.405249", + "step": 2166, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01954605057835579, + "timestamp": "2025-09-30 22:13:00.413299", + "step": 2167, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:00.470031", + "step": 2167, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.022829841822385788, + "timestamp": "2025-09-30 22:13:00.484098", + "step": 2168, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:00.539409", + "step": 2168, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007014808710664511, + "timestamp": "2025-09-30 22:13:00.541868", + "step": 2169, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:13:00.603810", + "step": 2169, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0007217395468614995, + "timestamp": "2025-09-30 22:13:00.609050", + "step": 2170, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:00.668024", + "step": 2170, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0036041310522705317, + "timestamp": "2025-09-30 22:13:00.671095", + "step": 2171, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:00.732930", + "step": 2171, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0017051756149157882, + "timestamp": "2025-09-30 22:13:00.740229", + "step": 2172, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:00.802077", + "step": 2172, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01299409568309784, + "timestamp": "2025-09-30 22:13:00.806365", + "step": 2173, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:00.860598", + "step": 2173, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008998853154480457, + "timestamp": "2025-09-30 22:13:00.868540", + "step": 2174, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:00.923574", + "step": 2174, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.022543568164110184, + "timestamp": "2025-09-30 22:13:00.933624", + "step": 2175, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:00.995139", + "step": 2175, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00293608452193439, + "timestamp": "2025-09-30 22:13:01.001532", + "step": 2176, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:01.063423", + "step": 2176, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007718019187450409, + "timestamp": "2025-09-30 22:13:01.077359", + "step": 2177, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:01.139374", + "step": 2177, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.05061681196093559, + "timestamp": "2025-09-30 22:13:01.142240", + "step": 2178, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:01.205104", + "step": 2178, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0036929224152117968, + "timestamp": "2025-09-30 22:13:01.209922", + "step": 2179, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:01.274566", + "step": 2179, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010988332331180573, + "timestamp": "2025-09-30 22:13:01.282766", + "step": 2180, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:01.339141", + "step": 2180, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006843825336545706, + "timestamp": "2025-09-30 22:13:01.345966", + "step": 2181, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:01.404084", + "step": 2181, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0031122539658099413, + "timestamp": "2025-09-30 22:13:01.410275", + "step": 2182, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:01.467236", + "step": 2182, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012426759116351604, + "timestamp": "2025-09-30 22:13:01.470013", + "step": 2183, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:01.527816", + "step": 2183, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011462002992630005, + "timestamp": "2025-09-30 22:13:01.537702", + "step": 2184, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:01.603326", + "step": 2184, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013611538335680962, + "timestamp": "2025-09-30 22:13:01.605682", + "step": 2185, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:01.662218", + "step": 2185, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00627383217215538, + "timestamp": "2025-09-30 22:13:01.666345", + "step": 2186, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:01.723862", + "step": 2186, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016363272443413734, + "timestamp": "2025-09-30 22:13:01.728741", + "step": 2187, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:01.787447", + "step": 2187, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01059199869632721, + "timestamp": "2025-09-30 22:13:01.795897", + "step": 2188, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:01.851716", + "step": 2188, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007782274391502142, + "timestamp": "2025-09-30 22:13:01.854373", + "step": 2189, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:01.909906", + "step": 2189, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004535790532827377, + "timestamp": "2025-09-30 22:13:01.919337", + "step": 2190, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:01.983918", + "step": 2190, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.028639093041419983, + "timestamp": "2025-09-30 22:13:01.992130", + "step": 2191, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:02.047436", + "step": 2191, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0020749655086547136, + "timestamp": "2025-09-30 22:13:02.053529", + "step": 2192, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:02.114527", + "step": 2192, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004414039198309183, + "timestamp": "2025-09-30 22:13:02.120142", + "step": 2193, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:02.181647", + "step": 2193, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.022147908806800842, + "timestamp": "2025-09-30 22:13:02.193109", + "step": 2194, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:02.261747", + "step": 2194, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0016150318551808596, + "timestamp": "2025-09-30 22:13:02.274714", + "step": 2195, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:02.342578", + "step": 2195, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04595841094851494, + "timestamp": "2025-09-30 22:13:02.361835", + "step": 2196, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:02.421979", + "step": 2196, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02536730282008648, + "timestamp": "2025-09-30 22:13:02.426233", + "step": 2197, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:02.480360", + "step": 2197, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0070744892582297325, + "timestamp": "2025-09-30 22:13:02.489001", + "step": 2198, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:02.544743", + "step": 2198, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005618877708911896, + "timestamp": "2025-09-30 22:13:02.547146", + "step": 2199, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:02.606597", + "step": 2199, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0013747888151556253, + "timestamp": "2025-09-30 22:13:02.615649", + "step": 2200, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:13:02.670306", + "step": 2200, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00427739042788744, + "timestamp": "2025-09-30 22:13:02.672784", + "step": 2201, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:02.749819", + "step": 2201, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012922806665301323, + "timestamp": "2025-09-30 22:13:02.752972", + "step": 2202, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:02.811834", + "step": 2202, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0016593519831076264, + "timestamp": "2025-09-30 22:13:02.817260", + "step": 2203, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:02.872153", + "step": 2203, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003395694075152278, + "timestamp": "2025-09-30 22:13:02.881138", + "step": 2204, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:02.939609", + "step": 2204, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0013927072286605835, + "timestamp": "2025-09-30 22:13:02.942410", + "step": 2205, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:02.998936", + "step": 2205, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.017614727839827538, + "timestamp": "2025-09-30 22:13:03.008428", + "step": 2206, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:03.080214", + "step": 2206, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0028198177460581064, + "timestamp": "2025-09-30 22:13:03.083580", + "step": 2207, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:03.148948", + "step": 2207, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02004883624613285, + "timestamp": "2025-09-30 22:13:03.156280", + "step": 2208, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:03.215571", + "step": 2208, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010212014429271221, + "timestamp": "2025-09-30 22:13:03.221426", + "step": 2209, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:03.276640", + "step": 2209, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002033967524766922, + "timestamp": "2025-09-30 22:13:03.283455", + "step": 2210, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:03.343725", + "step": 2210, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0013138767099007964, + "timestamp": "2025-09-30 22:13:03.347390", + "step": 2211, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:03.413671", + "step": 2211, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0013939599739387631, + "timestamp": "2025-09-30 22:13:03.432049", + "step": 2212, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:03.486732", + "step": 2212, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.024434858933091164, + "timestamp": "2025-09-30 22:13:03.489344", + "step": 2213, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:03.553859", + "step": 2213, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008440472185611725, + "timestamp": "2025-09-30 22:13:03.563935", + "step": 2214, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:03.624857", + "step": 2214, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01175626926124096, + "timestamp": "2025-09-30 22:13:03.628668", + "step": 2215, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:03.696209", + "step": 2215, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018027642741799355, + "timestamp": "2025-09-30 22:13:03.706990", + "step": 2216, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:03.764429", + "step": 2216, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008521536365151405, + "timestamp": "2025-09-30 22:13:03.772218", + "step": 2217, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:03.836575", + "step": 2217, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008888277225196362, + "timestamp": "2025-09-30 22:13:03.840283", + "step": 2218, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:03.902344", + "step": 2218, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0012383210705593228, + "timestamp": "2025-09-30 22:13:03.910926", + "step": 2219, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:03.977533", + "step": 2219, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0042413403280079365, + "timestamp": "2025-09-30 22:13:03.984542", + "step": 2220, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:04.040130", + "step": 2220, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0010714359814301133, + "timestamp": "2025-09-30 22:13:04.042791", + "step": 2221, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:04.107674", + "step": 2221, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006664037238806486, + "timestamp": "2025-09-30 22:13:04.111000", + "step": 2222, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:04.170928", + "step": 2222, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0031583583913743496, + "timestamp": "2025-09-30 22:13:04.173623", + "step": 2223, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:13:05.490048", + "step": 2223, + "epoch": 3 + }, + { + "type": "pplx", + "content": 29344370.172395393, + "timestamp": "2025-09-30 22:13:05.492733", + "step": 2223, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:05.546368", + "step": 2223, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006295633502304554, + "timestamp": "2025-09-30 22:13:05.552730", + "step": 2224, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:05.607930", + "step": 2224, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003405241295695305, + "timestamp": "2025-09-30 22:13:05.610697", + "step": 2225, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:05.667829", + "step": 2225, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002446610014885664, + "timestamp": "2025-09-30 22:13:05.673988", + "step": 2226, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:05.729752", + "step": 2226, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.017070619389414787, + "timestamp": "2025-09-30 22:13:05.733291", + "step": 2227, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:13:05.789448", + "step": 2227, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01944059133529663, + "timestamp": "2025-09-30 22:13:05.796101", + "step": 2228, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:05.852039", + "step": 2228, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0016683695139363408, + "timestamp": "2025-09-30 22:13:05.857473", + "step": 2229, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:05.925067", + "step": 2229, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0019555925391614437, + "timestamp": "2025-09-30 22:13:05.929845", + "step": 2230, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:05.989548", + "step": 2230, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00015620069461874664, + "timestamp": "2025-09-30 22:13:05.991628", + "step": 2231, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:06.052663", + "step": 2231, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01360904611647129, + "timestamp": "2025-09-30 22:13:06.063942", + "step": 2232, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:06.127300", + "step": 2232, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0014074391219764948, + "timestamp": "2025-09-30 22:13:06.132756", + "step": 2233, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:06.196856", + "step": 2233, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002081622602418065, + "timestamp": "2025-09-30 22:13:06.204891", + "step": 2234, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:06.262721", + "step": 2234, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0013570208102464676, + "timestamp": "2025-09-30 22:13:06.273367", + "step": 2235, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:06.341281", + "step": 2235, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0063094040378928185, + "timestamp": "2025-09-30 22:13:06.353285", + "step": 2236, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:06.425103", + "step": 2236, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011793047888204455, + "timestamp": "2025-09-30 22:13:06.432018", + "step": 2237, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:06.494162", + "step": 2237, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.024860413745045662, + "timestamp": "2025-09-30 22:13:06.499047", + "step": 2238, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:06.571763", + "step": 2238, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.029842732474207878, + "timestamp": "2025-09-30 22:13:06.581059", + "step": 2239, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:06.652497", + "step": 2239, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0033206476364284754, + "timestamp": "2025-09-30 22:13:06.664027", + "step": 2240, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:06.725036", + "step": 2240, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0032124409917742014, + "timestamp": "2025-09-30 22:13:06.729256", + "step": 2241, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:06.797711", + "step": 2241, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0014608422061428428, + "timestamp": "2025-09-30 22:13:06.820254", + "step": 2242, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:06.893327", + "step": 2242, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0015411525964736938, + "timestamp": "2025-09-30 22:13:06.909198", + "step": 2243, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:06.972246", + "step": 2243, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00187637226190418, + "timestamp": "2025-09-30 22:13:06.983547", + "step": 2244, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:07.038825", + "step": 2244, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0047594718635082245, + "timestamp": "2025-09-30 22:13:07.045563", + "step": 2245, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:07.105710", + "step": 2245, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003375373315066099, + "timestamp": "2025-09-30 22:13:07.108166", + "step": 2246, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:07.162366", + "step": 2246, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0013589292066171765, + "timestamp": "2025-09-30 22:13:07.165331", + "step": 2247, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:07.220888", + "step": 2247, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016323648393154144, + "timestamp": "2025-09-30 22:13:07.229246", + "step": 2248, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:07.283426", + "step": 2248, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006245364900678396, + "timestamp": "2025-09-30 22:13:07.292940", + "step": 2249, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:07.349128", + "step": 2249, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0017451593885198236, + "timestamp": "2025-09-30 22:13:07.352875", + "step": 2250, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:07.412427", + "step": 2250, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007775729056447744, + "timestamp": "2025-09-30 22:13:07.423227", + "step": 2251, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:07.480814", + "step": 2251, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02229670248925686, + "timestamp": "2025-09-30 22:13:07.491348", + "step": 2252, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:07.553274", + "step": 2252, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005842237151227891, + "timestamp": "2025-09-30 22:13:07.555760", + "step": 2253, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:07.624606", + "step": 2253, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0024967549834400415, + "timestamp": "2025-09-30 22:13:07.627473", + "step": 2254, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:07.682185", + "step": 2254, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00876485276967287, + "timestamp": "2025-09-30 22:13:07.685725", + "step": 2255, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:07.748598", + "step": 2255, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00036741181975230575, + "timestamp": "2025-09-30 22:13:07.754726", + "step": 2256, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:07.814528", + "step": 2256, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0012112419353798032, + "timestamp": "2025-09-30 22:13:07.818293", + "step": 2257, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:07.892341", + "step": 2257, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0117741534486413, + "timestamp": "2025-09-30 22:13:07.898564", + "step": 2258, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:07.956914", + "step": 2258, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001920665497891605, + "timestamp": "2025-09-30 22:13:07.968074", + "step": 2259, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:08.024271", + "step": 2259, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0017800560453906655, + "timestamp": "2025-09-30 22:13:08.032844", + "step": 2260, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:08.087032", + "step": 2260, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007017010357230902, + "timestamp": "2025-09-30 22:13:08.089549", + "step": 2261, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:08.146410", + "step": 2261, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0003600542258936912, + "timestamp": "2025-09-30 22:13:08.154264", + "step": 2262, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:08.211686", + "step": 2262, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009401796385645866, + "timestamp": "2025-09-30 22:13:08.214506", + "step": 2263, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:08.270432", + "step": 2263, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004206747282296419, + "timestamp": "2025-09-30 22:13:08.281054", + "step": 2264, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:08.339524", + "step": 2264, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00021178685710765421, + "timestamp": "2025-09-30 22:13:08.341888", + "step": 2265, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:08.397581", + "step": 2265, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00048541155410930514, + "timestamp": "2025-09-30 22:13:08.402602", + "step": 2266, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:08.458086", + "step": 2266, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003917259629815817, + "timestamp": "2025-09-30 22:13:08.464337", + "step": 2267, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:13:08.523381", + "step": 2267, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003979508299380541, + "timestamp": "2025-09-30 22:13:08.530734", + "step": 2268, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:08.597171", + "step": 2268, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0003099239547736943, + "timestamp": "2025-09-30 22:13:08.599264", + "step": 2269, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:08.652680", + "step": 2269, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.000424488156568259, + "timestamp": "2025-09-30 22:13:08.662711", + "step": 2270, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:08.715873", + "step": 2270, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007150634657591581, + "timestamp": "2025-09-30 22:13:08.718812", + "step": 2271, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:08.783543", + "step": 2271, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0024335901252925396, + "timestamp": "2025-09-30 22:13:08.790618", + "step": 2272, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:08.846206", + "step": 2272, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01159227080643177, + "timestamp": "2025-09-30 22:13:08.849451", + "step": 2273, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:08.917310", + "step": 2273, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005326881073415279, + "timestamp": "2025-09-30 22:13:08.926533", + "step": 2274, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:08.983661", + "step": 2274, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0012480862205848098, + "timestamp": "2025-09-30 22:13:08.992832", + "step": 2275, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:09.050359", + "step": 2275, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008051712065935135, + "timestamp": "2025-09-30 22:13:09.056973", + "step": 2276, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:09.120010", + "step": 2276, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0012699085054919124, + "timestamp": "2025-09-30 22:13:09.125755", + "step": 2277, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:09.180682", + "step": 2277, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006048406939953566, + "timestamp": "2025-09-30 22:13:09.183726", + "step": 2278, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:09.250575", + "step": 2278, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004453370813280344, + "timestamp": "2025-09-30 22:13:09.253346", + "step": 2279, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:09.308955", + "step": 2279, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002166403690353036, + "timestamp": "2025-09-30 22:13:09.315558", + "step": 2280, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:13:10.738278", + "step": 2280, + "epoch": 3 + }, + { + "type": "pplx", + "content": 31190960.103936635, + "timestamp": "2025-09-30 22:13:10.740643", + "step": 2280, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:10.792576", + "step": 2280, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00041843479266390204, + "timestamp": "2025-09-30 22:13:10.795117", + "step": 2281, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:10.859013", + "step": 2281, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006251118145883083, + "timestamp": "2025-09-30 22:13:10.862684", + "step": 2282, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:10.926799", + "step": 2282, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0009974894346669316, + "timestamp": "2025-09-30 22:13:10.929266", + "step": 2283, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:10.989116", + "step": 2283, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0032588716130703688, + "timestamp": "2025-09-30 22:13:10.997818", + "step": 2284, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:11.061570", + "step": 2284, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005364909302443266, + "timestamp": "2025-09-30 22:13:11.064019", + "step": 2285, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:11.122069", + "step": 2285, + "epoch": 3 + }, + { + "type": "loss", + "content": 8.350759890163317e-05, + "timestamp": "2025-09-30 22:13:11.124572", + "step": 2286, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:11.190995", + "step": 2286, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0043342201970517635, + "timestamp": "2025-09-30 22:13:11.195044", + "step": 2287, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:13:11.268671", + "step": 2287, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011432368773967028, + "timestamp": "2025-09-30 22:13:11.275236", + "step": 2288, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:11.339399", + "step": 2288, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0010339574655517936, + "timestamp": "2025-09-30 22:13:11.342362", + "step": 2289, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:11.403531", + "step": 2289, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0022677092347294092, + "timestamp": "2025-09-30 22:13:11.409800", + "step": 2290, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:11.472241", + "step": 2290, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004064720589667559, + "timestamp": "2025-09-30 22:13:11.474953", + "step": 2291, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:11.536884", + "step": 2291, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.027135172858834267, + "timestamp": "2025-09-30 22:13:11.543184", + "step": 2292, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:11.602293", + "step": 2292, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0008837772184051573, + "timestamp": "2025-09-30 22:13:11.608386", + "step": 2293, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:11.665675", + "step": 2293, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011004587635397911, + "timestamp": "2025-09-30 22:13:11.668035", + "step": 2294, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:11.727743", + "step": 2294, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004693236667662859, + "timestamp": "2025-09-30 22:13:11.730020", + "step": 2295, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:11.788422", + "step": 2295, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006029639625921845, + "timestamp": "2025-09-30 22:13:11.798640", + "step": 2296, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:11.854358", + "step": 2296, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001073610968887806, + "timestamp": "2025-09-30 22:13:11.856714", + "step": 2297, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:11.915935", + "step": 2297, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0027264878153800964, + "timestamp": "2025-09-30 22:13:11.920656", + "step": 2298, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:11.980816", + "step": 2298, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0007334630936384201, + "timestamp": "2025-09-30 22:13:11.985247", + "step": 2299, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:12.059254", + "step": 2299, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011301154736429453, + "timestamp": "2025-09-30 22:13:12.065752", + "step": 2300, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:12.123890", + "step": 2300, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00015225273091346025, + "timestamp": "2025-09-30 22:13:12.128224", + "step": 2301, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:12.196012", + "step": 2301, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.000535293947905302, + "timestamp": "2025-09-30 22:13:12.198299", + "step": 2302, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:12.252848", + "step": 2302, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004955708514899015, + "timestamp": "2025-09-30 22:13:12.260839", + "step": 2303, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:12.319753", + "step": 2303, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00846810918301344, + "timestamp": "2025-09-30 22:13:12.328509", + "step": 2304, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:12.383083", + "step": 2304, + "epoch": 3 + }, + { + "type": "loss", + "content": 6.449552165577188e-05, + "timestamp": "2025-09-30 22:13:12.388975", + "step": 2305, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:12.465214", + "step": 2305, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004094495438039303, + "timestamp": "2025-09-30 22:13:12.468338", + "step": 2306, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:12.524076", + "step": 2306, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0043173558078706264, + "timestamp": "2025-09-30 22:13:12.531143", + "step": 2307, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:12.586832", + "step": 2307, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013621537014842033, + "timestamp": "2025-09-30 22:13:12.593105", + "step": 2308, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:12.647208", + "step": 2308, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00045176432467997074, + "timestamp": "2025-09-30 22:13:12.651462", + "step": 2309, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:12.706835", + "step": 2309, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00153114995919168, + "timestamp": "2025-09-30 22:13:12.709825", + "step": 2310, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:12.768571", + "step": 2310, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0002749506966210902, + "timestamp": "2025-09-30 22:13:12.773973", + "step": 2311, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:12.830788", + "step": 2311, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0001482378429500386, + "timestamp": "2025-09-30 22:13:12.836862", + "step": 2312, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:12.899423", + "step": 2312, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0001635663356864825, + "timestamp": "2025-09-30 22:13:12.905845", + "step": 2313, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:12.972178", + "step": 2313, + "epoch": 3 + }, + { + "type": "loss", + "content": 2.553769627411384e-05, + "timestamp": "2025-09-30 22:13:12.974513", + "step": 2314, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:13.030917", + "step": 2314, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014685734175145626, + "timestamp": "2025-09-30 22:13:13.033978", + "step": 2315, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:13.092800", + "step": 2315, + "epoch": 3 + }, + { + "type": "loss", + "content": 7.750854274490848e-05, + "timestamp": "2025-09-30 22:13:13.105173", + "step": 2316, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:13.160613", + "step": 2316, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0010388302616775036, + "timestamp": "2025-09-30 22:13:13.164054", + "step": 2317, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:13.231936", + "step": 2317, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0024059859570115805, + "timestamp": "2025-09-30 22:13:13.239512", + "step": 2318, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:13.295025", + "step": 2318, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00040224703843705356, + "timestamp": "2025-09-30 22:13:13.304151", + "step": 2319, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:13.366702", + "step": 2319, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.000600070517975837, + "timestamp": "2025-09-30 22:13:13.372841", + "step": 2320, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:13.428888", + "step": 2320, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.043553676456213, + "timestamp": "2025-09-30 22:13:13.432384", + "step": 2321, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:13.495898", + "step": 2321, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01901986077427864, + "timestamp": "2025-09-30 22:13:13.498854", + "step": 2322, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:13.566548", + "step": 2322, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.020047681406140327, + "timestamp": "2025-09-30 22:13:13.569703", + "step": 2323, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:13.625179", + "step": 2323, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00016043984214775264, + "timestamp": "2025-09-30 22:13:13.632855", + "step": 2324, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:13.696114", + "step": 2324, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012856409884989262, + "timestamp": "2025-09-30 22:13:13.699713", + "step": 2325, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:13.762802", + "step": 2325, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004349694121629, + "timestamp": "2025-09-30 22:13:13.765900", + "step": 2326, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:13.825731", + "step": 2326, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0007128869765438139, + "timestamp": "2025-09-30 22:13:13.829485", + "step": 2327, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:13.886792", + "step": 2327, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006236379034817219, + "timestamp": "2025-09-30 22:13:13.901117", + "step": 2328, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:13.960939", + "step": 2328, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0003488468355499208, + "timestamp": "2025-09-30 22:13:13.964455", + "step": 2329, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:14.036143", + "step": 2329, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00010532321175560355, + "timestamp": "2025-09-30 22:13:14.039724", + "step": 2330, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:14.099722", + "step": 2330, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03490881994366646, + "timestamp": "2025-09-30 22:13:14.103491", + "step": 2331, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:14.168611", + "step": 2331, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013101531192660332, + "timestamp": "2025-09-30 22:13:14.179820", + "step": 2332, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:14.240568", + "step": 2332, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005494463839568198, + "timestamp": "2025-09-30 22:13:14.244365", + "step": 2333, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:14.304060", + "step": 2333, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0002779501664917916, + "timestamp": "2025-09-30 22:13:14.315947", + "step": 2334, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:14.375221", + "step": 2334, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0017243318725377321, + "timestamp": "2025-09-30 22:13:14.378038", + "step": 2335, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:14.440264", + "step": 2335, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0026360393967479467, + "timestamp": "2025-09-30 22:13:14.452840", + "step": 2336, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:14.513305", + "step": 2336, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0022121057845652103, + "timestamp": "2025-09-30 22:13:14.521054", + "step": 2337, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:13:15.906861", + "step": 2337, + "epoch": 3 + }, + { + "type": "pplx", + "content": 35685924.762230136, + "timestamp": "2025-09-30 22:13:15.911272", + "step": 2337, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:15.969547", + "step": 2337, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0001224317093146965, + "timestamp": "2025-09-30 22:13:15.973320", + "step": 2338, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:16.041128", + "step": 2338, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006064993212930858, + "timestamp": "2025-09-30 22:13:16.044973", + "step": 2339, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:16.100730", + "step": 2339, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003981334622949362, + "timestamp": "2025-09-30 22:13:16.107765", + "step": 2340, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:16.166753", + "step": 2340, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00029052264289930463, + "timestamp": "2025-09-30 22:13:16.171192", + "step": 2341, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:16.226872", + "step": 2341, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010386792942881584, + "timestamp": "2025-09-30 22:13:16.230245", + "step": 2342, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:16.294054", + "step": 2342, + "epoch": 3 + }, + { + "type": "loss", + "content": 5.578704076469876e-05, + "timestamp": "2025-09-30 22:13:16.297024", + "step": 2343, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:16.356213", + "step": 2343, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0007381403702311218, + "timestamp": "2025-09-30 22:13:16.369322", + "step": 2344, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:16.426141", + "step": 2344, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03852478787302971, + "timestamp": "2025-09-30 22:13:16.430172", + "step": 2345, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:16.489012", + "step": 2345, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006424127495847642, + "timestamp": "2025-09-30 22:13:16.499829", + "step": 2346, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:16.556480", + "step": 2346, + "epoch": 3 + }, + { + "type": "loss", + "content": 9.567866800352931e-05, + "timestamp": "2025-09-30 22:13:16.560414", + "step": 2347, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:16.615496", + "step": 2347, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009399794973433018, + "timestamp": "2025-09-30 22:13:16.630284", + "step": 2348, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:16.693055", + "step": 2348, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0003211422299500555, + "timestamp": "2025-09-30 22:13:16.697092", + "step": 2349, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:16.757223", + "step": 2349, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0036444012075662613, + "timestamp": "2025-09-30 22:13:16.762122", + "step": 2350, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:16.821943", + "step": 2350, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.026028577238321304, + "timestamp": "2025-09-30 22:13:16.832804", + "step": 2351, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:16.901109", + "step": 2351, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0048171780072152615, + "timestamp": "2025-09-30 22:13:16.908059", + "step": 2352, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:16.962841", + "step": 2352, + "epoch": 3 + }, + { + "type": "loss", + "content": 3.750595715246163e-05, + "timestamp": "2025-09-30 22:13:16.965327", + "step": 2353, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:17.025383", + "step": 2353, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008849648758769035, + "timestamp": "2025-09-30 22:13:17.028464", + "step": 2354, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:17.092237", + "step": 2354, + "epoch": 3 + }, + { + "type": "loss", + "content": 6.525561184389517e-05, + "timestamp": "2025-09-30 22:13:17.100739", + "step": 2355, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:17.163544", + "step": 2355, + "epoch": 3 + }, + { + "type": "loss", + "content": 9.634840534999967e-05, + "timestamp": "2025-09-30 22:13:17.171199", + "step": 2356, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:17.228897", + "step": 2356, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0001659039407968521, + "timestamp": "2025-09-30 22:13:17.233707", + "step": 2357, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:17.301413", + "step": 2357, + "epoch": 3 + }, + { + "type": "loss", + "content": 8.300376066472381e-05, + "timestamp": "2025-09-30 22:13:17.306270", + "step": 2358, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:17.365873", + "step": 2358, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006306396098807454, + "timestamp": "2025-09-30 22:13:17.369232", + "step": 2359, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:17.436726", + "step": 2359, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012602239847183228, + "timestamp": "2025-09-30 22:13:17.443521", + "step": 2360, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:17.504255", + "step": 2360, + "epoch": 3 + }, + { + "type": "loss", + "content": 6.385560845956206e-05, + "timestamp": "2025-09-30 22:13:17.513534", + "step": 2361, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:17.570422", + "step": 2361, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011304274667054415, + "timestamp": "2025-09-30 22:13:17.573485", + "step": 2362, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:17.631234", + "step": 2362, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0038645591121166945, + "timestamp": "2025-09-30 22:13:17.634878", + "step": 2363, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:17.691433", + "step": 2363, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007124242372810841, + "timestamp": "2025-09-30 22:13:17.704027", + "step": 2364, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:17.764565", + "step": 2364, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0018359085079282522, + "timestamp": "2025-09-30 22:13:17.773024", + "step": 2365, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:17.833422", + "step": 2365, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014947568997740746, + "timestamp": "2025-09-30 22:13:17.835913", + "step": 2366, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:17.893406", + "step": 2366, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00046386828762479126, + "timestamp": "2025-09-30 22:13:17.896906", + "step": 2367, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:17.960708", + "step": 2367, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008881064131855965, + "timestamp": "2025-09-30 22:13:17.968071", + "step": 2368, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:18.023659", + "step": 2368, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0001027476173476316, + "timestamp": "2025-09-30 22:13:18.026536", + "step": 2369, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:18.086101", + "step": 2369, + "epoch": 3 + }, + { + "type": "loss", + "content": 5.726577728637494e-05, + "timestamp": "2025-09-30 22:13:18.089037", + "step": 2370, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:18.152428", + "step": 2370, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0024566694628447294, + "timestamp": "2025-09-30 22:13:18.155698", + "step": 2371, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:18.215358", + "step": 2371, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002534597646445036, + "timestamp": "2025-09-30 22:13:18.222732", + "step": 2372, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:18.287719", + "step": 2372, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04354690760374069, + "timestamp": "2025-09-30 22:13:18.292228", + "step": 2373, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:18.348467", + "step": 2373, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.017217475920915604, + "timestamp": "2025-09-30 22:13:18.352618", + "step": 2374, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:18.411457", + "step": 2374, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007565335836261511, + "timestamp": "2025-09-30 22:13:18.415699", + "step": 2375, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:18.472300", + "step": 2375, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.026072219014167786, + "timestamp": "2025-09-30 22:13:18.480814", + "step": 2376, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:18.548341", + "step": 2376, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0003426198090892285, + "timestamp": "2025-09-30 22:13:18.552600", + "step": 2377, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:18.609365", + "step": 2377, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.07469062507152557, + "timestamp": "2025-09-30 22:13:18.613435", + "step": 2378, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:18.677596", + "step": 2378, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.023098181933164597, + "timestamp": "2025-09-30 22:13:18.680446", + "step": 2379, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:18.742608", + "step": 2379, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0065413895063102245, + "timestamp": "2025-09-30 22:13:18.749044", + "step": 2380, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:18.804305", + "step": 2380, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009016537107527256, + "timestamp": "2025-09-30 22:13:18.808806", + "step": 2381, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:18.872537", + "step": 2381, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005769102368503809, + "timestamp": "2025-09-30 22:13:18.883286", + "step": 2382, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:18.939108", + "step": 2382, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03586021065711975, + "timestamp": "2025-09-30 22:13:18.943050", + "step": 2383, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:18.999422", + "step": 2383, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.032269734889268875, + "timestamp": "2025-09-30 22:13:19.006351", + "step": 2384, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:19.069159", + "step": 2384, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00165548047516495, + "timestamp": "2025-09-30 22:13:19.072039", + "step": 2385, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:19.131862", + "step": 2385, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0024231227580457926, + "timestamp": "2025-09-30 22:13:19.134492", + "step": 2386, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:19.198543", + "step": 2386, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04199283942580223, + "timestamp": "2025-09-30 22:13:19.201655", + "step": 2387, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:19.257463", + "step": 2387, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03284038230776787, + "timestamp": "2025-09-30 22:13:19.273136", + "step": 2388, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:19.330743", + "step": 2388, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0017338943434879184, + "timestamp": "2025-09-30 22:13:19.333819", + "step": 2389, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:19.390251", + "step": 2389, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004067179746925831, + "timestamp": "2025-09-30 22:13:19.407477", + "step": 2390, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:19.473602", + "step": 2390, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002591915661469102, + "timestamp": "2025-09-30 22:13:19.482602", + "step": 2391, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:19.543242", + "step": 2391, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006193013396114111, + "timestamp": "2025-09-30 22:13:19.550256", + "step": 2392, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:19.613204", + "step": 2392, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018228931352496147, + "timestamp": "2025-09-30 22:13:19.616517", + "step": 2393, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:19.673910", + "step": 2393, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011682414915412664, + "timestamp": "2025-09-30 22:13:19.676994", + "step": 2394, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:13:21.124634", + "step": 2394, + "epoch": 3 + }, + { + "type": "pplx", + "content": 32188811.96573708, + "timestamp": "2025-09-30 22:13:21.128270", + "step": 2394, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:21.183488", + "step": 2394, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005388319492340088, + "timestamp": "2025-09-30 22:13:21.187245", + "step": 2395, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:21.247593", + "step": 2395, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013586322776973248, + "timestamp": "2025-09-30 22:13:21.254124", + "step": 2396, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:21.312845", + "step": 2396, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006401594262570143, + "timestamp": "2025-09-30 22:13:21.321760", + "step": 2397, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:21.379109", + "step": 2397, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009359225630760193, + "timestamp": "2025-09-30 22:13:21.381831", + "step": 2398, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:21.444723", + "step": 2398, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.021020669490098953, + "timestamp": "2025-09-30 22:13:21.448756", + "step": 2399, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:21.512288", + "step": 2399, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012792134657502174, + "timestamp": "2025-09-30 22:13:21.518632", + "step": 2400, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:21.573224", + "step": 2400, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004996354691684246, + "timestamp": "2025-09-30 22:13:21.575642", + "step": 2401, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:13:21.637750", + "step": 2401, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.029606416821479797, + "timestamp": "2025-09-30 22:13:21.642603", + "step": 2402, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:21.700319", + "step": 2402, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006963818334043026, + "timestamp": "2025-09-30 22:13:21.703901", + "step": 2403, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:21.761290", + "step": 2403, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0031231886241585016, + "timestamp": "2025-09-30 22:13:21.768141", + "step": 2404, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:21.832875", + "step": 2404, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005416753236204386, + "timestamp": "2025-09-30 22:13:21.836985", + "step": 2405, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:21.901336", + "step": 2405, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007782486267387867, + "timestamp": "2025-09-30 22:13:21.914547", + "step": 2406, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:21.973355", + "step": 2406, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013428935781121254, + "timestamp": "2025-09-30 22:13:21.976497", + "step": 2407, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:22.032826", + "step": 2407, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016297975555062294, + "timestamp": "2025-09-30 22:13:22.040439", + "step": 2408, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:22.100144", + "step": 2408, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00910912174731493, + "timestamp": "2025-09-30 22:13:22.105263", + "step": 2409, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:22.160966", + "step": 2409, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011663010343909264, + "timestamp": "2025-09-30 22:13:22.170743", + "step": 2410, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:22.228090", + "step": 2410, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010669506154954433, + "timestamp": "2025-09-30 22:13:22.230710", + "step": 2411, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:13:22.289260", + "step": 2411, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009635014459490776, + "timestamp": "2025-09-30 22:13:22.295350", + "step": 2412, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:22.355178", + "step": 2412, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006291415076702833, + "timestamp": "2025-09-30 22:13:22.358455", + "step": 2413, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:22.416008", + "step": 2413, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005919110961258411, + "timestamp": "2025-09-30 22:13:22.424105", + "step": 2414, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:22.480075", + "step": 2414, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009971419349312782, + "timestamp": "2025-09-30 22:13:22.482487", + "step": 2415, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:22.538731", + "step": 2415, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006832667160779238, + "timestamp": "2025-09-30 22:13:22.545680", + "step": 2416, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:22.601841", + "step": 2416, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012653900310397148, + "timestamp": "2025-09-30 22:13:22.604658", + "step": 2417, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:22.663734", + "step": 2417, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009062431752681732, + "timestamp": "2025-09-30 22:13:22.667671", + "step": 2418, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:22.732330", + "step": 2418, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0064752777107059956, + "timestamp": "2025-09-30 22:13:22.737635", + "step": 2419, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:22.798336", + "step": 2419, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003598024370148778, + "timestamp": "2025-09-30 22:13:22.806812", + "step": 2420, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:22.865289", + "step": 2420, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002874630270525813, + "timestamp": "2025-09-30 22:13:22.869382", + "step": 2421, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:22.925654", + "step": 2421, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03282652422785759, + "timestamp": "2025-09-30 22:13:22.929496", + "step": 2422, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:22.984177", + "step": 2422, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013241315260529518, + "timestamp": "2025-09-30 22:13:22.987735", + "step": 2423, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:23.051284", + "step": 2423, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014345109462738037, + "timestamp": "2025-09-30 22:13:23.059332", + "step": 2424, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 176 + ], + "flops": 3520021436096.0 + }, + "timestamp": "2025-09-30 22:13:23.121038", + "step": 2424, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015767786651849747, + "timestamp": "2025-09-30 22:13:23.123336", + "step": 2425, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:23.184027", + "step": 2425, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004914857912808657, + "timestamp": "2025-09-30 22:13:23.190312", + "step": 2426, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:23.247883", + "step": 2426, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.021198097616434097, + "timestamp": "2025-09-30 22:13:23.252020", + "step": 2427, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:23.314039", + "step": 2427, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.000634256808552891, + "timestamp": "2025-09-30 22:13:23.320233", + "step": 2428, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:23.373727", + "step": 2428, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010591315105557442, + "timestamp": "2025-09-30 22:13:23.376300", + "step": 2429, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:23.434026", + "step": 2429, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.019355354830622673, + "timestamp": "2025-09-30 22:13:23.443163", + "step": 2430, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:23.505531", + "step": 2430, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01241142489016056, + "timestamp": "2025-09-30 22:13:23.511100", + "step": 2431, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:23.567662", + "step": 2431, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009565332904458046, + "timestamp": "2025-09-30 22:13:23.575405", + "step": 2432, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:23.632837", + "step": 2432, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.025015641003847122, + "timestamp": "2025-09-30 22:13:23.635530", + "step": 2433, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:23.697422", + "step": 2433, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006486161146312952, + "timestamp": "2025-09-30 22:13:23.709455", + "step": 2434, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:23.771333", + "step": 2434, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011576437391340733, + "timestamp": "2025-09-30 22:13:23.776736", + "step": 2435, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:23.837294", + "step": 2435, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015482393093407154, + "timestamp": "2025-09-30 22:13:23.844023", + "step": 2436, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:23.899133", + "step": 2436, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003182504326105118, + "timestamp": "2025-09-30 22:13:23.904724", + "step": 2437, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:23.964847", + "step": 2437, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011627678759396076, + "timestamp": "2025-09-30 22:13:23.968295", + "step": 2438, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:24.025865", + "step": 2438, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0038793303538113832, + "timestamp": "2025-09-30 22:13:24.032278", + "step": 2439, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:24.096569", + "step": 2439, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005496421363204718, + "timestamp": "2025-09-30 22:13:24.108928", + "step": 2440, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:24.169991", + "step": 2440, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012969347648322582, + "timestamp": "2025-09-30 22:13:24.176292", + "step": 2441, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:24.237921", + "step": 2441, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005602761637419462, + "timestamp": "2025-09-30 22:13:24.242046", + "step": 2442, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:24.305589", + "step": 2442, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00627124821767211, + "timestamp": "2025-09-30 22:13:24.309441", + "step": 2443, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:24.369126", + "step": 2443, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01086896751075983, + "timestamp": "2025-09-30 22:13:24.377066", + "step": 2444, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:24.430661", + "step": 2444, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.026062259450554848, + "timestamp": "2025-09-30 22:13:24.434410", + "step": 2445, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:24.496901", + "step": 2445, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012748646549880505, + "timestamp": "2025-09-30 22:13:24.499871", + "step": 2446, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:24.560400", + "step": 2446, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018161652609705925, + "timestamp": "2025-09-30 22:13:24.567688", + "step": 2447, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:24.621315", + "step": 2447, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006955179385840893, + "timestamp": "2025-09-30 22:13:24.627375", + "step": 2448, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:24.684306", + "step": 2448, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011628582142293453, + "timestamp": "2025-09-30 22:13:24.688144", + "step": 2449, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:24.748192", + "step": 2449, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00847693346440792, + "timestamp": "2025-09-30 22:13:24.751297", + "step": 2450, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:24.811240", + "step": 2450, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.017132606357336044, + "timestamp": "2025-09-30 22:13:24.814566", + "step": 2451, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:13:26.264249", + "step": 2451, + "epoch": 3 + }, + { + "type": "pplx", + "content": 28315239.44436738, + "timestamp": "2025-09-30 22:13:26.268190", + "step": 2451, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:26.323933", + "step": 2451, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03608303517103195, + "timestamp": "2025-09-30 22:13:26.329858", + "step": 2452, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:26.390834", + "step": 2452, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005534134339541197, + "timestamp": "2025-09-30 22:13:26.397219", + "step": 2453, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:26.465956", + "step": 2453, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003311963053420186, + "timestamp": "2025-09-30 22:13:26.470711", + "step": 2454, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:26.534839", + "step": 2454, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03324703872203827, + "timestamp": "2025-09-30 22:13:26.539042", + "step": 2455, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:26.598664", + "step": 2455, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0027091566007584333, + "timestamp": "2025-09-30 22:13:26.606372", + "step": 2456, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:26.666515", + "step": 2456, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012049831449985504, + "timestamp": "2025-09-30 22:13:26.669406", + "step": 2457, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:26.725248", + "step": 2457, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007934476248919964, + "timestamp": "2025-09-30 22:13:26.728122", + "step": 2458, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:26.784509", + "step": 2458, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0067762285470962524, + "timestamp": "2025-09-30 22:13:26.787065", + "step": 2459, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:26.846731", + "step": 2459, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006125275394879282, + "timestamp": "2025-09-30 22:13:26.852534", + "step": 2460, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:26.909867", + "step": 2460, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001969063188880682, + "timestamp": "2025-09-30 22:13:26.919777", + "step": 2461, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:26.982301", + "step": 2461, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01936742290854454, + "timestamp": "2025-09-30 22:13:26.985867", + "step": 2462, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:27.056042", + "step": 2462, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02352396585047245, + "timestamp": "2025-09-30 22:13:27.062482", + "step": 2463, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:27.118380", + "step": 2463, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.021059229969978333, + "timestamp": "2025-09-30 22:13:27.124270", + "step": 2464, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:27.185631", + "step": 2464, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0014644553884863853, + "timestamp": "2025-09-30 22:13:27.191249", + "step": 2465, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:27.261864", + "step": 2465, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002741077449172735, + "timestamp": "2025-09-30 22:13:27.264795", + "step": 2466, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:27.320924", + "step": 2466, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003315993817523122, + "timestamp": "2025-09-30 22:13:27.324832", + "step": 2467, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:27.380192", + "step": 2467, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016506491228938103, + "timestamp": "2025-09-30 22:13:27.387391", + "step": 2468, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:27.450382", + "step": 2468, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005784686654806137, + "timestamp": "2025-09-30 22:13:27.453645", + "step": 2469, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:27.515525", + "step": 2469, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.024664176627993584, + "timestamp": "2025-09-30 22:13:27.518250", + "step": 2470, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:27.572542", + "step": 2470, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00011582062143133953, + "timestamp": "2025-09-30 22:13:27.578850", + "step": 2471, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:27.637005", + "step": 2471, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007763525936752558, + "timestamp": "2025-09-30 22:13:27.646004", + "step": 2472, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:27.702172", + "step": 2472, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010659711435437202, + "timestamp": "2025-09-30 22:13:27.706063", + "step": 2473, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:27.777540", + "step": 2473, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0018466237233951688, + "timestamp": "2025-09-30 22:13:27.779959", + "step": 2474, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:27.852539", + "step": 2474, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0035447957925498486, + "timestamp": "2025-09-30 22:13:27.859813", + "step": 2475, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:27.920238", + "step": 2475, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005354622844606638, + "timestamp": "2025-09-30 22:13:27.926963", + "step": 2476, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:27.989674", + "step": 2476, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005044917925260961, + "timestamp": "2025-09-30 22:13:27.992747", + "step": 2477, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:28.048142", + "step": 2477, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02057606168091297, + "timestamp": "2025-09-30 22:13:28.051060", + "step": 2478, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:28.108531", + "step": 2478, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00041592912748456, + "timestamp": "2025-09-30 22:13:28.110908", + "step": 2479, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:28.173240", + "step": 2479, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02707737870514393, + "timestamp": "2025-09-30 22:13:28.179409", + "step": 2480, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:28.234239", + "step": 2480, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008141888305544853, + "timestamp": "2025-09-30 22:13:28.237098", + "step": 2481, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:28.294134", + "step": 2481, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005826229462400079, + "timestamp": "2025-09-30 22:13:28.297607", + "step": 2482, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:28.354318", + "step": 2482, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0010274297092109919, + "timestamp": "2025-09-30 22:13:28.358929", + "step": 2483, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:28.416658", + "step": 2483, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00012387447350192815, + "timestamp": "2025-09-30 22:13:28.424114", + "step": 2484, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:28.486895", + "step": 2484, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0458819679915905, + "timestamp": "2025-09-30 22:13:28.489480", + "step": 2485, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:28.545121", + "step": 2485, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0024083128664642572, + "timestamp": "2025-09-30 22:13:28.549784", + "step": 2486, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:28.612063", + "step": 2486, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018058612942695618, + "timestamp": "2025-09-30 22:13:28.615950", + "step": 2487, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:28.677326", + "step": 2487, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016298463568091393, + "timestamp": "2025-09-30 22:13:28.683260", + "step": 2488, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:28.739919", + "step": 2488, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007573925890028477, + "timestamp": "2025-09-30 22:13:28.742847", + "step": 2489, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:28.802419", + "step": 2489, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00014498885138891637, + "timestamp": "2025-09-30 22:13:28.807170", + "step": 2490, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:28.872060", + "step": 2490, + "epoch": 3 + }, + { + "type": "loss", + "content": 6.842377479188144e-05, + "timestamp": "2025-09-30 22:13:28.879991", + "step": 2491, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:28.951699", + "step": 2491, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.027848348021507263, + "timestamp": "2025-09-30 22:13:28.958119", + "step": 2492, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:29.023111", + "step": 2492, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0003194676246494055, + "timestamp": "2025-09-30 22:13:29.026405", + "step": 2493, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:29.085635", + "step": 2493, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004121209029108286, + "timestamp": "2025-09-30 22:13:29.091479", + "step": 2494, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:29.151939", + "step": 2494, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.030184591189026833, + "timestamp": "2025-09-30 22:13:29.154364", + "step": 2495, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:29.213774", + "step": 2495, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00010497510811546817, + "timestamp": "2025-09-30 22:13:29.225978", + "step": 2496, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:29.288120", + "step": 2496, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006152989808470011, + "timestamp": "2025-09-30 22:13:29.298523", + "step": 2497, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:29.366801", + "step": 2497, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010256282053887844, + "timestamp": "2025-09-30 22:13:29.378777", + "step": 2498, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:29.443430", + "step": 2498, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0030539052095264196, + "timestamp": "2025-09-30 22:13:29.445695", + "step": 2499, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:29.502248", + "step": 2499, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005430816672742367, + "timestamp": "2025-09-30 22:13:29.512673", + "step": 2500, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2500", + "timestamp": "2025-09-30 22:13:29.992354", + "step": 2500, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:30.056711", + "step": 2500, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0053467415273189545, + "timestamp": "2025-09-30 22:13:30.068444", + "step": 2501, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:30.125796", + "step": 2501, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011339363642036915, + "timestamp": "2025-09-30 22:13:30.131339", + "step": 2502, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:30.192817", + "step": 2502, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010253089480102062, + "timestamp": "2025-09-30 22:13:30.202343", + "step": 2503, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:30.266346", + "step": 2503, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04441189020872116, + "timestamp": "2025-09-30 22:13:30.284633", + "step": 2504, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:30.343260", + "step": 2504, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0009698430076241493, + "timestamp": "2025-09-30 22:13:30.346379", + "step": 2505, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:30.415313", + "step": 2505, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008782203309237957, + "timestamp": "2025-09-30 22:13:30.421399", + "step": 2506, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:30.488357", + "step": 2506, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0034844086039811373, + "timestamp": "2025-09-30 22:13:30.495408", + "step": 2507, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:30.561594", + "step": 2507, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02779925987124443, + "timestamp": "2025-09-30 22:13:30.570543", + "step": 2508, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:13:32.146436", + "step": 2508, + "epoch": 3 + }, + { + "type": "pplx", + "content": 28484982.005562108, + "timestamp": "2025-09-30 22:13:32.149120", + "step": 2508, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:32.201869", + "step": 2508, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04372706264257431, + "timestamp": "2025-09-30 22:13:32.204873", + "step": 2509, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:32.277804", + "step": 2509, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006275986321270466, + "timestamp": "2025-09-30 22:13:32.294723", + "step": 2510, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:32.372737", + "step": 2510, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007544786669313908, + "timestamp": "2025-09-30 22:13:32.395531", + "step": 2511, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:32.472031", + "step": 2511, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.030385946854948997, + "timestamp": "2025-09-30 22:13:32.497664", + "step": 2512, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:32.576892", + "step": 2512, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02495487593114376, + "timestamp": "2025-09-30 22:13:32.608326", + "step": 2513, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:32.686036", + "step": 2513, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005949577782303095, + "timestamp": "2025-09-30 22:13:32.709754", + "step": 2514, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:32.779864", + "step": 2514, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016375141218304634, + "timestamp": "2025-09-30 22:13:32.800113", + "step": 2515, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:32.881676", + "step": 2515, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03323351964354515, + "timestamp": "2025-09-30 22:13:32.895106", + "step": 2516, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:32.968779", + "step": 2516, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010078134015202522, + "timestamp": "2025-09-30 22:13:32.983705", + "step": 2517, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:33.056435", + "step": 2517, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005874228663742542, + "timestamp": "2025-09-30 22:13:33.068804", + "step": 2518, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:33.139380", + "step": 2518, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.026764482259750366, + "timestamp": "2025-09-30 22:13:33.150956", + "step": 2519, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:33.215944", + "step": 2519, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007020972203463316, + "timestamp": "2025-09-30 22:13:33.230128", + "step": 2520, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:33.306129", + "step": 2520, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002482133684679866, + "timestamp": "2025-09-30 22:13:33.321965", + "step": 2521, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:33.407397", + "step": 2521, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0031678727827966213, + "timestamp": "2025-09-30 22:13:33.422133", + "step": 2522, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:33.491337", + "step": 2522, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03531847894191742, + "timestamp": "2025-09-30 22:13:33.512412", + "step": 2523, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:33.578179", + "step": 2523, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012394100427627563, + "timestamp": "2025-09-30 22:13:33.600136", + "step": 2524, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:33.666404", + "step": 2524, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011437847279012203, + "timestamp": "2025-09-30 22:13:33.682021", + "step": 2525, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:33.771138", + "step": 2525, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002483924152329564, + "timestamp": "2025-09-30 22:13:33.791110", + "step": 2526, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:33.863813", + "step": 2526, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01781102456152439, + "timestamp": "2025-09-30 22:13:33.879117", + "step": 2527, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:33.948914", + "step": 2527, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02498701587319374, + "timestamp": "2025-09-30 22:13:33.974229", + "step": 2528, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:34.047825", + "step": 2528, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004967492539435625, + "timestamp": "2025-09-30 22:13:34.057256", + "step": 2529, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:34.120092", + "step": 2529, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008416562341153622, + "timestamp": "2025-09-30 22:13:34.124286", + "step": 2530, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:34.180050", + "step": 2530, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004394114948809147, + "timestamp": "2025-09-30 22:13:34.188245", + "step": 2531, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:34.255255", + "step": 2531, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0030704778619110584, + "timestamp": "2025-09-30 22:13:34.262524", + "step": 2532, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:34.322691", + "step": 2532, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.021440336480736732, + "timestamp": "2025-09-30 22:13:34.325981", + "step": 2533, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:34.381080", + "step": 2533, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013712027110159397, + "timestamp": "2025-09-30 22:13:34.389556", + "step": 2534, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:34.449231", + "step": 2534, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006967604160308838, + "timestamp": "2025-09-30 22:13:34.457892", + "step": 2535, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:34.518202", + "step": 2535, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003231785027310252, + "timestamp": "2025-09-30 22:13:34.529400", + "step": 2536, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:34.584782", + "step": 2536, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018372103571891785, + "timestamp": "2025-09-30 22:13:34.594720", + "step": 2537, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:34.652122", + "step": 2537, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005035504698753357, + "timestamp": "2025-09-30 22:13:34.660471", + "step": 2538, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:34.717692", + "step": 2538, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008455309085547924, + "timestamp": "2025-09-30 22:13:34.720813", + "step": 2539, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:34.778433", + "step": 2539, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.000451569736469537, + "timestamp": "2025-09-30 22:13:34.785623", + "step": 2540, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:34.857888", + "step": 2540, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005161632434464991, + "timestamp": "2025-09-30 22:13:34.861478", + "step": 2541, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:34.929400", + "step": 2541, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004660214763134718, + "timestamp": "2025-09-30 22:13:34.931747", + "step": 2542, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:35.002296", + "step": 2542, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005621172953397036, + "timestamp": "2025-09-30 22:13:35.004815", + "step": 2543, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:35.062321", + "step": 2543, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02957882173359394, + "timestamp": "2025-09-30 22:13:35.069169", + "step": 2544, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:13:35.140995", + "step": 2544, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009165803901851177, + "timestamp": "2025-09-30 22:13:35.143954", + "step": 2545, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:35.201122", + "step": 2545, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02128801867365837, + "timestamp": "2025-09-30 22:13:35.208203", + "step": 2546, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:35.268216", + "step": 2546, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005340006668120623, + "timestamp": "2025-09-30 22:13:35.276293", + "step": 2547, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:35.340625", + "step": 2547, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007607621140778065, + "timestamp": "2025-09-30 22:13:35.348466", + "step": 2548, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:35.415112", + "step": 2548, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.021063433960080147, + "timestamp": "2025-09-30 22:13:35.418212", + "step": 2549, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:35.482643", + "step": 2549, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011848625726997852, + "timestamp": "2025-09-30 22:13:35.486294", + "step": 2550, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:35.553913", + "step": 2550, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010470031760632992, + "timestamp": "2025-09-30 22:13:35.557896", + "step": 2551, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:35.614322", + "step": 2551, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007319572381675243, + "timestamp": "2025-09-30 22:13:35.625177", + "step": 2552, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:35.683824", + "step": 2552, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002631863346323371, + "timestamp": "2025-09-30 22:13:35.686736", + "step": 2553, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:35.743155", + "step": 2553, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.026114294305443764, + "timestamp": "2025-09-30 22:13:35.745574", + "step": 2554, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:35.799953", + "step": 2554, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0004028195107821375, + "timestamp": "2025-09-30 22:13:35.802048", + "step": 2555, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:35.861107", + "step": 2555, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005155097460374236, + "timestamp": "2025-09-30 22:13:35.868717", + "step": 2556, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:35.931861", + "step": 2556, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014193429611623287, + "timestamp": "2025-09-30 22:13:35.935249", + "step": 2557, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:35.991284", + "step": 2557, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.023700231686234474, + "timestamp": "2025-09-30 22:13:35.997758", + "step": 2558, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:36.057147", + "step": 2558, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013391217216849327, + "timestamp": "2025-09-30 22:13:36.060664", + "step": 2559, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:36.126611", + "step": 2559, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0035434234887361526, + "timestamp": "2025-09-30 22:13:36.132365", + "step": 2560, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:36.191606", + "step": 2560, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02454542927443981, + "timestamp": "2025-09-30 22:13:36.199936", + "step": 2561, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:36.279053", + "step": 2561, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0029066719580441713, + "timestamp": "2025-09-30 22:13:36.287710", + "step": 2562, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:36.365339", + "step": 2562, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006104097701609135, + "timestamp": "2025-09-30 22:13:36.368124", + "step": 2563, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:36.428303", + "step": 2563, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010215085931122303, + "timestamp": "2025-09-30 22:13:36.437744", + "step": 2564, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:36.503906", + "step": 2564, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005062797572463751, + "timestamp": "2025-09-30 22:13:36.509952", + "step": 2565, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:13:37.982199", + "step": 2565, + "epoch": 3 + }, + { + "type": "pplx", + "content": 28095844.67479086, + "timestamp": "2025-09-30 22:13:37.984464", + "step": 2565, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:38.037632", + "step": 2565, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010721017606556416, + "timestamp": "2025-09-30 22:13:38.039899", + "step": 2566, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:38.108144", + "step": 2566, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003516018856316805, + "timestamp": "2025-09-30 22:13:38.110577", + "step": 2567, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:38.167050", + "step": 2567, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0029169064946472645, + "timestamp": "2025-09-30 22:13:38.174014", + "step": 2568, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:38.237774", + "step": 2568, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0035148721653968096, + "timestamp": "2025-09-30 22:13:38.240537", + "step": 2569, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:38.296057", + "step": 2569, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.033282749354839325, + "timestamp": "2025-09-30 22:13:38.300103", + "step": 2570, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:38.361289", + "step": 2570, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010799924843013287, + "timestamp": "2025-09-30 22:13:38.363894", + "step": 2571, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:38.428150", + "step": 2571, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005024234298616648, + "timestamp": "2025-09-30 22:13:38.434615", + "step": 2572, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:38.494982", + "step": 2572, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0024888780899345875, + "timestamp": "2025-09-30 22:13:38.497928", + "step": 2573, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:38.564411", + "step": 2573, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014496888034045696, + "timestamp": "2025-09-30 22:13:38.573061", + "step": 2574, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:38.629771", + "step": 2574, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006988645065575838, + "timestamp": "2025-09-30 22:13:38.637345", + "step": 2575, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:38.697004", + "step": 2575, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01132346410304308, + "timestamp": "2025-09-30 22:13:38.703263", + "step": 2576, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:38.764133", + "step": 2576, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0035836149472743273, + "timestamp": "2025-09-30 22:13:38.767094", + "step": 2577, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:38.830451", + "step": 2577, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013794556260108948, + "timestamp": "2025-09-30 22:13:38.833653", + "step": 2578, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:38.892206", + "step": 2578, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004456694237887859, + "timestamp": "2025-09-30 22:13:38.895467", + "step": 2579, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:38.958609", + "step": 2579, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006100859493017197, + "timestamp": "2025-09-30 22:13:38.965114", + "step": 2580, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:39.029425", + "step": 2580, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0033564306795597076, + "timestamp": "2025-09-30 22:13:39.031433", + "step": 2581, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:39.088665", + "step": 2581, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0037206211127340794, + "timestamp": "2025-09-30 22:13:39.095796", + "step": 2582, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:39.153293", + "step": 2582, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006556731648743153, + "timestamp": "2025-09-30 22:13:39.155960", + "step": 2583, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:39.210529", + "step": 2583, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01759198307991028, + "timestamp": "2025-09-30 22:13:39.216788", + "step": 2584, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:39.271308", + "step": 2584, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012082146480679512, + "timestamp": "2025-09-30 22:13:39.273747", + "step": 2585, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:39.328828", + "step": 2585, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.034108925610780716, + "timestamp": "2025-09-30 22:13:39.331233", + "step": 2586, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:39.389845", + "step": 2586, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003907191567122936, + "timestamp": "2025-09-30 22:13:39.403544", + "step": 2587, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:39.465603", + "step": 2587, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009326784871518612, + "timestamp": "2025-09-30 22:13:39.472112", + "step": 2588, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:39.531850", + "step": 2588, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008430427871644497, + "timestamp": "2025-09-30 22:13:39.535223", + "step": 2589, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:39.599643", + "step": 2589, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005537763237953186, + "timestamp": "2025-09-30 22:13:39.602815", + "step": 2590, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:39.659746", + "step": 2590, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00847475416958332, + "timestamp": "2025-09-30 22:13:39.664121", + "step": 2591, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:39.726143", + "step": 2591, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0008981380960904062, + "timestamp": "2025-09-30 22:13:39.736910", + "step": 2592, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:39.801757", + "step": 2592, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0013419969473034143, + "timestamp": "2025-09-30 22:13:39.805646", + "step": 2593, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:39.860221", + "step": 2593, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0038829329423606396, + "timestamp": "2025-09-30 22:13:39.870646", + "step": 2594, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:39.925607", + "step": 2594, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0077403089962899685, + "timestamp": "2025-09-30 22:13:39.928827", + "step": 2595, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 160 + ], + "flops": 3200019493504.0 + }, + "timestamp": "2025-09-30 22:13:39.984471", + "step": 2595, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013570351526141167, + "timestamp": "2025-09-30 22:13:39.990851", + "step": 2596, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:40.054183", + "step": 2596, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005910519044846296, + "timestamp": "2025-09-30 22:13:40.057403", + "step": 2597, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:40.115868", + "step": 2597, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0021806147415190935, + "timestamp": "2025-09-30 22:13:40.119422", + "step": 2598, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:40.176335", + "step": 2598, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002977479714900255, + "timestamp": "2025-09-30 22:13:40.184658", + "step": 2599, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:40.248724", + "step": 2599, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011497695231810212, + "timestamp": "2025-09-30 22:13:40.259275", + "step": 2600, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:40.316010", + "step": 2600, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0021200214978307486, + "timestamp": "2025-09-30 22:13:40.322723", + "step": 2601, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:40.384991", + "step": 2601, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003689644392579794, + "timestamp": "2025-09-30 22:13:40.388971", + "step": 2602, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:40.451268", + "step": 2602, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002164714504033327, + "timestamp": "2025-09-30 22:13:40.454859", + "step": 2603, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:40.516437", + "step": 2603, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.024041039869189262, + "timestamp": "2025-09-30 22:13:40.523219", + "step": 2604, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:40.581524", + "step": 2604, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.053784094750881195, + "timestamp": "2025-09-30 22:13:40.587466", + "step": 2605, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:40.643075", + "step": 2605, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00025220931274816394, + "timestamp": "2025-09-30 22:13:40.648320", + "step": 2606, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:40.706291", + "step": 2606, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0412961021065712, + "timestamp": "2025-09-30 22:13:40.709924", + "step": 2607, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:40.768686", + "step": 2607, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0008691218099556863, + "timestamp": "2025-09-30 22:13:40.775025", + "step": 2608, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:40.829933", + "step": 2608, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012250705622136593, + "timestamp": "2025-09-30 22:13:40.840134", + "step": 2609, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:40.896800", + "step": 2609, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007466053124517202, + "timestamp": "2025-09-30 22:13:40.899879", + "step": 2610, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:40.962887", + "step": 2610, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0008426569984294474, + "timestamp": "2025-09-30 22:13:40.975062", + "step": 2611, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:41.033291", + "step": 2611, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001527618383988738, + "timestamp": "2025-09-30 22:13:41.040007", + "step": 2612, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:41.094083", + "step": 2612, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.05261802300810814, + "timestamp": "2025-09-30 22:13:41.098284", + "step": 2613, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:41.154590", + "step": 2613, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00033618827001191676, + "timestamp": "2025-09-30 22:13:41.159766", + "step": 2614, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:41.219906", + "step": 2614, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013994032517075539, + "timestamp": "2025-09-30 22:13:41.222645", + "step": 2615, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:41.277574", + "step": 2615, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001805710606276989, + "timestamp": "2025-09-30 22:13:41.283924", + "step": 2616, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:41.338699", + "step": 2616, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01090067345649004, + "timestamp": "2025-09-30 22:13:41.341341", + "step": 2617, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:41.395509", + "step": 2617, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010763258673250675, + "timestamp": "2025-09-30 22:13:41.398233", + "step": 2618, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:41.456411", + "step": 2618, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00023230792430695146, + "timestamp": "2025-09-30 22:13:41.459052", + "step": 2619, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:41.517705", + "step": 2619, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00010520854266360402, + "timestamp": "2025-09-30 22:13:41.526262", + "step": 2620, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:41.592677", + "step": 2620, + "epoch": 3 + }, + { + "type": "loss", + "content": 8.937703387346119e-05, + "timestamp": "2025-09-30 22:13:41.596741", + "step": 2621, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:41.651606", + "step": 2621, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003324234625324607, + "timestamp": "2025-09-30 22:13:41.656269", + "step": 2622, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:13:42.991390", + "step": 2622, + "epoch": 3 + }, + { + "type": "pplx", + "content": 31437886.455709014, + "timestamp": "2025-09-30 22:13:42.996628", + "step": 2622, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:43.052565", + "step": 2622, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005293331108987331, + "timestamp": "2025-09-30 22:13:43.059695", + "step": 2623, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:43.124834", + "step": 2623, + "epoch": 3 + }, + { + "type": "loss", + "content": 9.892590605886653e-05, + "timestamp": "2025-09-30 22:13:43.132164", + "step": 2624, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:43.192440", + "step": 2624, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.033769041299819946, + "timestamp": "2025-09-30 22:13:43.201469", + "step": 2625, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:43.266274", + "step": 2625, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.036175686866045, + "timestamp": "2025-09-30 22:13:43.269492", + "step": 2626, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:43.334734", + "step": 2626, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0016300681745633483, + "timestamp": "2025-09-30 22:13:43.349674", + "step": 2627, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:43.405168", + "step": 2627, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.052250683307647705, + "timestamp": "2025-09-30 22:13:43.412369", + "step": 2628, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:43.477284", + "step": 2628, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003264912636950612, + "timestamp": "2025-09-30 22:13:43.480096", + "step": 2629, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:43.539111", + "step": 2629, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006807534955441952, + "timestamp": "2025-09-30 22:13:43.544956", + "step": 2630, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:43.601818", + "step": 2630, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006088690715841949, + "timestamp": "2025-09-30 22:13:43.610272", + "step": 2631, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:43.673117", + "step": 2631, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006588431540876627, + "timestamp": "2025-09-30 22:13:43.691680", + "step": 2632, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:43.747030", + "step": 2632, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006200451171025634, + "timestamp": "2025-09-30 22:13:43.750018", + "step": 2633, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:43.811484", + "step": 2633, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01981678046286106, + "timestamp": "2025-09-30 22:13:43.814840", + "step": 2634, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:43.873252", + "step": 2634, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003335049608722329, + "timestamp": "2025-09-30 22:13:43.875994", + "step": 2635, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:43.939080", + "step": 2635, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02097000740468502, + "timestamp": "2025-09-30 22:13:43.951340", + "step": 2636, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:44.008281", + "step": 2636, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007635528687387705, + "timestamp": "2025-09-30 22:13:44.014140", + "step": 2637, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:44.072624", + "step": 2637, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0021020916756242514, + "timestamp": "2025-09-30 22:13:44.078675", + "step": 2638, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:44.142189", + "step": 2638, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00463180523365736, + "timestamp": "2025-09-30 22:13:44.145158", + "step": 2639, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:44.207719", + "step": 2639, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00478312699124217, + "timestamp": "2025-09-30 22:13:44.215952", + "step": 2640, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:44.272292", + "step": 2640, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0008612548117525876, + "timestamp": "2025-09-30 22:13:44.276660", + "step": 2641, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:44.344905", + "step": 2641, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0024010143242776394, + "timestamp": "2025-09-30 22:13:44.357248", + "step": 2642, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:44.423670", + "step": 2642, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003124420763924718, + "timestamp": "2025-09-30 22:13:44.427994", + "step": 2643, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:44.495267", + "step": 2643, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013605816289782524, + "timestamp": "2025-09-30 22:13:44.502640", + "step": 2644, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:44.560412", + "step": 2644, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0009547712397761643, + "timestamp": "2025-09-30 22:13:44.572377", + "step": 2645, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:44.628288", + "step": 2645, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0024611575063318014, + "timestamp": "2025-09-30 22:13:44.632917", + "step": 2646, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:44.693593", + "step": 2646, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010525861755013466, + "timestamp": "2025-09-30 22:13:44.701639", + "step": 2647, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:44.761423", + "step": 2647, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014524830505251884, + "timestamp": "2025-09-30 22:13:44.768375", + "step": 2648, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:44.830634", + "step": 2648, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0008129668422043324, + "timestamp": "2025-09-30 22:13:44.836483", + "step": 2649, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:44.893886", + "step": 2649, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004618306644260883, + "timestamp": "2025-09-30 22:13:44.896386", + "step": 2650, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:44.953263", + "step": 2650, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0023160020355135202, + "timestamp": "2025-09-30 22:13:44.955676", + "step": 2651, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:45.012135", + "step": 2651, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0021287633571773767, + "timestamp": "2025-09-30 22:13:45.020394", + "step": 2652, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:45.075628", + "step": 2652, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015038742683827877, + "timestamp": "2025-09-30 22:13:45.082391", + "step": 2653, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:45.137061", + "step": 2653, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0270217452198267, + "timestamp": "2025-09-30 22:13:45.142322", + "step": 2654, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:45.202368", + "step": 2654, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.05145685747265816, + "timestamp": "2025-09-30 22:13:45.206286", + "step": 2655, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:45.265867", + "step": 2655, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008141874335706234, + "timestamp": "2025-09-30 22:13:45.272859", + "step": 2656, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:45.327084", + "step": 2656, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002790646394714713, + "timestamp": "2025-09-30 22:13:45.335543", + "step": 2657, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:45.394029", + "step": 2657, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008883380331099033, + "timestamp": "2025-09-30 22:13:45.396873", + "step": 2658, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:45.459598", + "step": 2658, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006654282100498676, + "timestamp": "2025-09-30 22:13:45.466734", + "step": 2659, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:45.533117", + "step": 2659, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01170284952968359, + "timestamp": "2025-09-30 22:13:45.539352", + "step": 2660, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:45.593881", + "step": 2660, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007521068211644888, + "timestamp": "2025-09-30 22:13:45.597392", + "step": 2661, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:45.658090", + "step": 2661, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007094702683389187, + "timestamp": "2025-09-30 22:13:45.661569", + "step": 2662, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:45.716779", + "step": 2662, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008204026147723198, + "timestamp": "2025-09-30 22:13:45.719813", + "step": 2663, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:45.776614", + "step": 2663, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011364804580807686, + "timestamp": "2025-09-30 22:13:45.783795", + "step": 2664, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:45.840029", + "step": 2664, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015917399898171425, + "timestamp": "2025-09-30 22:13:45.842921", + "step": 2665, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:45.905060", + "step": 2665, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006572159472852945, + "timestamp": "2025-09-30 22:13:45.914197", + "step": 2666, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:45.975171", + "step": 2666, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0034901797771453857, + "timestamp": "2025-09-30 22:13:45.977763", + "step": 2667, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:46.035916", + "step": 2667, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014389263466000557, + "timestamp": "2025-09-30 22:13:46.042903", + "step": 2668, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:46.099722", + "step": 2668, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002424341393634677, + "timestamp": "2025-09-30 22:13:46.106840", + "step": 2669, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:46.166856", + "step": 2669, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01114154327660799, + "timestamp": "2025-09-30 22:13:46.169241", + "step": 2670, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:46.223757", + "step": 2670, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.023684168234467506, + "timestamp": "2025-09-30 22:13:46.226315", + "step": 2671, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:46.283561", + "step": 2671, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002832164289429784, + "timestamp": "2025-09-30 22:13:46.299391", + "step": 2672, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:46.362243", + "step": 2672, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008885924704372883, + "timestamp": "2025-09-30 22:13:46.367794", + "step": 2673, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:46.430281", + "step": 2673, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0018890589708462358, + "timestamp": "2025-09-30 22:13:46.433980", + "step": 2674, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:46.491992", + "step": 2674, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004877452738583088, + "timestamp": "2025-09-30 22:13:46.503349", + "step": 2675, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:46.572081", + "step": 2675, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004309082869440317, + "timestamp": "2025-09-30 22:13:46.578554", + "step": 2676, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:46.639190", + "step": 2676, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009656992740929127, + "timestamp": "2025-09-30 22:13:46.642116", + "step": 2677, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:46.699896", + "step": 2677, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.020586643368005753, + "timestamp": "2025-09-30 22:13:46.702389", + "step": 2678, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:46.761163", + "step": 2678, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0008080621482804418, + "timestamp": "2025-09-30 22:13:46.770580", + "step": 2679, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:13:48.223899", + "step": 2679, + "epoch": 3 + }, + { + "type": "pplx", + "content": 27226157.57321593, + "timestamp": "2025-09-30 22:13:48.229459", + "step": 2679, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:48.285279", + "step": 2679, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005241225007921457, + "timestamp": "2025-09-30 22:13:48.291639", + "step": 2680, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:48.347738", + "step": 2680, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00306515721604228, + "timestamp": "2025-09-30 22:13:48.350274", + "step": 2681, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:48.408822", + "step": 2681, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00246833311393857, + "timestamp": "2025-09-30 22:13:48.411670", + "step": 2682, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:48.465693", + "step": 2682, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0012627762043848634, + "timestamp": "2025-09-30 22:13:48.468124", + "step": 2683, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:48.525025", + "step": 2683, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00040524639189243317, + "timestamp": "2025-09-30 22:13:48.530951", + "step": 2684, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:48.596407", + "step": 2684, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0009502816246822476, + "timestamp": "2025-09-30 22:13:48.606005", + "step": 2685, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:48.671814", + "step": 2685, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007616551127284765, + "timestamp": "2025-09-30 22:13:48.675296", + "step": 2686, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:48.732914", + "step": 2686, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005208977032452822, + "timestamp": "2025-09-30 22:13:48.736853", + "step": 2687, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:48.806110", + "step": 2687, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0008302520145662129, + "timestamp": "2025-09-30 22:13:48.823721", + "step": 2688, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:48.883982", + "step": 2688, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0001742185850162059, + "timestamp": "2025-09-30 22:13:48.887570", + "step": 2689, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:48.956348", + "step": 2689, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0018354164203628898, + "timestamp": "2025-09-30 22:13:48.959330", + "step": 2690, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:49.020243", + "step": 2690, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006067172158509493, + "timestamp": "2025-09-30 22:13:49.023125", + "step": 2691, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:49.083643", + "step": 2691, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.023693010210990906, + "timestamp": "2025-09-30 22:13:49.095187", + "step": 2692, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:49.161575", + "step": 2692, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0013803176116198301, + "timestamp": "2025-09-30 22:13:49.168354", + "step": 2693, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:49.230112", + "step": 2693, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001813818933442235, + "timestamp": "2025-09-30 22:13:49.233718", + "step": 2694, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:49.293224", + "step": 2694, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011404008837416768, + "timestamp": "2025-09-30 22:13:49.302174", + "step": 2695, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:49.359140", + "step": 2695, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007178679574280977, + "timestamp": "2025-09-30 22:13:49.370459", + "step": 2696, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:49.429837", + "step": 2696, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011296061566099524, + "timestamp": "2025-09-30 22:13:49.438620", + "step": 2697, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:49.496637", + "step": 2697, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011266040382906795, + "timestamp": "2025-09-30 22:13:49.502242", + "step": 2698, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:49.561787", + "step": 2698, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006445986800827086, + "timestamp": "2025-09-30 22:13:49.567761", + "step": 2699, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:49.624521", + "step": 2699, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005764567176811397, + "timestamp": "2025-09-30 22:13:49.630654", + "step": 2700, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:49.696603", + "step": 2700, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0010664776200428605, + "timestamp": "2025-09-30 22:13:49.700446", + "step": 2701, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:49.756793", + "step": 2701, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00033894533407874405, + "timestamp": "2025-09-30 22:13:49.759941", + "step": 2702, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:49.820434", + "step": 2702, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001302658929489553, + "timestamp": "2025-09-30 22:13:49.824764", + "step": 2703, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:49.879323", + "step": 2703, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003532660659402609, + "timestamp": "2025-09-30 22:13:49.889824", + "step": 2704, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:49.945843", + "step": 2704, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0034972601570189, + "timestamp": "2025-09-30 22:13:49.951608", + "step": 2705, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:50.016545", + "step": 2705, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00021062916493974626, + "timestamp": "2025-09-30 22:13:50.020864", + "step": 2706, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:50.078562", + "step": 2706, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.000698353978805244, + "timestamp": "2025-09-30 22:13:50.081409", + "step": 2707, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:50.144600", + "step": 2707, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014168920926749706, + "timestamp": "2025-09-30 22:13:50.154442", + "step": 2708, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:50.212646", + "step": 2708, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0037853510584682226, + "timestamp": "2025-09-30 22:13:50.220630", + "step": 2709, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:50.283443", + "step": 2709, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010242769494652748, + "timestamp": "2025-09-30 22:13:50.290377", + "step": 2710, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:50.344434", + "step": 2710, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0002763148513622582, + "timestamp": "2025-09-30 22:13:50.347449", + "step": 2711, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:50.411418", + "step": 2711, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0003090985701419413, + "timestamp": "2025-09-30 22:13:50.428813", + "step": 2712, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:50.489893", + "step": 2712, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003153016325086355, + "timestamp": "2025-09-30 22:13:50.493106", + "step": 2713, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:50.551946", + "step": 2713, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014958287589251995, + "timestamp": "2025-09-30 22:13:50.555908", + "step": 2714, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:50.626790", + "step": 2714, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03713773563504219, + "timestamp": "2025-09-30 22:13:50.630948", + "step": 2715, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:50.690432", + "step": 2715, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0021324113477021456, + "timestamp": "2025-09-30 22:13:50.699332", + "step": 2716, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:50.755210", + "step": 2716, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010403887368738651, + "timestamp": "2025-09-30 22:13:50.758629", + "step": 2717, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:50.822734", + "step": 2717, + "epoch": 3 + }, + { + "type": "loss", + "content": 9.978410525945947e-05, + "timestamp": "2025-09-30 22:13:50.831743", + "step": 2718, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:50.892991", + "step": 2718, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0008979692356660962, + "timestamp": "2025-09-30 22:13:50.896480", + "step": 2719, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:50.952988", + "step": 2719, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012770486064255238, + "timestamp": "2025-09-30 22:13:50.965793", + "step": 2720, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:51.027607", + "step": 2720, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006667285342700779, + "timestamp": "2025-09-30 22:13:51.031206", + "step": 2721, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:51.089950", + "step": 2721, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001039764960296452, + "timestamp": "2025-09-30 22:13:51.092464", + "step": 2722, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:51.157150", + "step": 2722, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007044041994959116, + "timestamp": "2025-09-30 22:13:51.166520", + "step": 2723, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:51.225414", + "step": 2723, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006314881145954132, + "timestamp": "2025-09-30 22:13:51.231890", + "step": 2724, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:51.293636", + "step": 2724, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012850704602897167, + "timestamp": "2025-09-30 22:13:51.297398", + "step": 2725, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:51.352110", + "step": 2725, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0020042883697897196, + "timestamp": "2025-09-30 22:13:51.355893", + "step": 2726, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:51.410769", + "step": 2726, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0015756689244881272, + "timestamp": "2025-09-30 22:13:51.414151", + "step": 2727, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:51.472208", + "step": 2727, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0008090141927823424, + "timestamp": "2025-09-30 22:13:51.480117", + "step": 2728, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:51.543520", + "step": 2728, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0029392943251878023, + "timestamp": "2025-09-30 22:13:51.548126", + "step": 2729, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:51.613147", + "step": 2729, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0013291776413097978, + "timestamp": "2025-09-30 22:13:51.616109", + "step": 2730, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:51.672856", + "step": 2730, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001892697880975902, + "timestamp": "2025-09-30 22:13:51.684947", + "step": 2731, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:51.755930", + "step": 2731, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0166336540132761, + "timestamp": "2025-09-30 22:13:51.769304", + "step": 2732, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:51.826448", + "step": 2732, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006099882302805781, + "timestamp": "2025-09-30 22:13:51.829660", + "step": 2733, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:51.889560", + "step": 2733, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00124065310228616, + "timestamp": "2025-09-30 22:13:51.892903", + "step": 2734, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:51.953941", + "step": 2734, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008868148550391197, + "timestamp": "2025-09-30 22:13:51.966422", + "step": 2735, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:52.031390", + "step": 2735, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.031378235667943954, + "timestamp": "2025-09-30 22:13:52.038860", + "step": 2736, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:13:53.450275", + "step": 2736, + "epoch": 3 + }, + { + "type": "pplx", + "content": 27265120.538364556, + "timestamp": "2025-09-30 22:13:53.453548", + "step": 2736, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:53.506520", + "step": 2736, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004425609949976206, + "timestamp": "2025-09-30 22:13:53.510006", + "step": 2737, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:53.564344", + "step": 2737, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00230988091789186, + "timestamp": "2025-09-30 22:13:53.567973", + "step": 2738, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:53.629941", + "step": 2738, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012407422065734863, + "timestamp": "2025-09-30 22:13:53.632681", + "step": 2739, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:53.698644", + "step": 2739, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0033368500880897045, + "timestamp": "2025-09-30 22:13:53.705763", + "step": 2740, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:53.761332", + "step": 2740, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001278606941923499, + "timestamp": "2025-09-30 22:13:53.764488", + "step": 2741, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:53.832321", + "step": 2741, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0054036402143538, + "timestamp": "2025-09-30 22:13:53.835475", + "step": 2742, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:53.891626", + "step": 2742, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02728707529604435, + "timestamp": "2025-09-30 22:13:53.895367", + "step": 2743, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:53.954983", + "step": 2743, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005847959313541651, + "timestamp": "2025-09-30 22:13:53.961864", + "step": 2744, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:54.025521", + "step": 2744, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006271410267800093, + "timestamp": "2025-09-30 22:13:54.028753", + "step": 2745, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:54.089180", + "step": 2745, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006004456430673599, + "timestamp": "2025-09-30 22:13:54.093167", + "step": 2746, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:54.169048", + "step": 2746, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.019149092957377434, + "timestamp": "2025-09-30 22:13:54.171857", + "step": 2747, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 112 + ], + "flops": 2240013665728.0 + }, + "timestamp": "2025-09-30 22:13:54.246050", + "step": 2747, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0032882175873965025, + "timestamp": "2025-09-30 22:13:54.264454", + "step": 2748, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:54.320406", + "step": 2748, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.024987680837512016, + "timestamp": "2025-09-30 22:13:54.325559", + "step": 2749, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 128 + ], + "flops": 2560015608320.0 + }, + "timestamp": "2025-09-30 22:13:54.390676", + "step": 2749, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.043651703745126724, + "timestamp": "2025-09-30 22:13:54.394730", + "step": 2750, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 144 + ], + "flops": 2880017550912.0 + }, + "timestamp": "2025-09-30 22:13:54.466019", + "step": 2750, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03387341648340225, + "timestamp": "2025-09-30 22:13:54.468879", + "step": 2751, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 112 + ], + "batch_size": 8, + "flops": 2235680280448 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 80 + ], + "batch_size": 8, + "flops": 1596914505344 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 96 + ], + "batch_size": 8, + "flops": 1916297392896 + } + ], + "timestamp": "2025-09-30 22:13:55.951133", + "step": 2751, + "epoch": 3 + }, + { + "type": "pplx", + "content": 29126071.55555798, + "timestamp": "2025-09-30 22:13:55.953357", + "step": 2751, + "epoch": 3 + }, + { + "type": "best_pplx", + "content": 27226157.57321593, + "timestamp": "2025-09-30 22:13:55.955439", + "step": 2751, + "epoch": 3 + }, + { + "type": "best_step", + "content": 2679, + "timestamp": "2025-09-30 22:13:55.957537", + "step": 2751, + "epoch": 3 + }, + { + "type": "total_pplx_flops", + "content": 5062218940038400, + "timestamp": "2025-09-30 22:13:55.959569", + "step": 2751, + "epoch": 3 + }, + { + "type": "total_train_flops", + "content": 7174123736893632.0, + "timestamp": "2025-09-30 22:13:55.961809", + "step": 2751, + "epoch": 3 + } + ], + "best_evals": { + "pplx": { + "score": 27226157.57321593, + "step": 2679 + }, + "rougel": { + "precision": 0.8357843137254902, + "recall": 0.8357843137254902, + "fmeasure": 0.8357843137254902 + } + } +} \ No newline at end of file