{ "training_args": { "output_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters_2/nlu_mrpc_lora_v1", "overwrite_output_dir": false, "do_train": false, "do_eval": true, "do_predict": false, "eval_strategy": "steps", "prediction_loss_only": false, "per_device_train_batch_size": 4, "per_device_eval_batch_size": 8, "per_gpu_train_batch_size": null, "per_gpu_eval_batch_size": null, "gradient_accumulation_steps": 4, "eval_accumulation_steps": null, "eval_delay": 0, "torch_empty_cache_steps": null, "learning_rate": 5e-05, "weight_decay": 0.0, "adam_beta1": 0.9, "adam_beta2": 0.999, "adam_epsilon": 1e-08, "max_grad_norm": 1.0, "num_train_epochs": 3, "max_steps": -1, "lr_scheduler_type": "linear", "lr_scheduler_kwargs": {}, "warmup_ratio": 0.0, "warmup_steps": 0, "log_level": "passive", "log_level_replica": "warning", "log_on_each_node": true, "logging_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters_2/nlu_mrpc_lora_v1/runs/Sep30_22-09-37_gx10", "logging_strategy": "steps", "logging_first_step": false, "logging_steps": 20, "logging_nan_inf_filter": true, "save_strategy": "epoch", "save_steps": 500, "save_total_limit": null, "save_safetensors": true, "save_on_each_node": false, "save_only_model": false, "restore_callback_states_from_checkpoint": false, "no_cuda": false, "use_cpu": false, "use_mps_device": false, "seed": 42, "data_seed": null, "jit_mode_eval": false, "use_ipex": false, "bf16": false, "fp16": false, "fp16_opt_level": "O1", "half_precision_backend": "auto", "bf16_full_eval": false, "fp16_full_eval": false, "tf32": null, "local_rank": 0, "ddp_backend": null, "tpu_num_cores": null, "tpu_metrics_debug": false, "debug": [], "dataloader_drop_last": false, "eval_steps": 57, "dataloader_num_workers": 0, "dataloader_prefetch_factor": null, "past_index": -1, "run_name": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters_2/nlu_mrpc_lora_v1", "disable_tqdm": false, "remove_unused_columns": true, "label_names": null, "load_best_model_at_end": false, "metric_for_best_model": null, "greater_is_better": null, "ignore_data_skip": false, "fsdp": [], "fsdp_min_num_params": 0, "fsdp_config": { "min_num_params": 0, "xla": false, "xla_fsdp_v2": false, "xla_fsdp_grad_ckpt": false }, "fsdp_transformer_layer_cls_to_wrap": null, "accelerator_config": { "split_batches": false, "dispatch_batches": null, "even_batches": true, "use_seedable_sampler": true, "non_blocking": false, "gradient_accumulation_kwargs": null }, "deepspeed": null, "label_smoothing_factor": 0.0, "optim": "adamw_torch", "optim_args": null, "adafactor": false, "group_by_length": false, "length_column_name": "length", "report_to": [], "ddp_find_unused_parameters": null, "ddp_bucket_cap_mb": null, "ddp_broadcast_buffers": null, "dataloader_pin_memory": true, "dataloader_persistent_workers": false, "skip_memory_metrics": true, "use_legacy_prediction_loop": false, "push_to_hub": false, "resume_from_checkpoint": null, "hub_model_id": null, "hub_strategy": "every_save", "hub_token": "", "hub_private_repo": null, "hub_always_push": false, "gradient_checkpointing": false, "gradient_checkpointing_kwargs": null, "include_inputs_for_metrics": false, "include_for_metrics": [], "eval_do_concat_batches": true, "fp16_backend": "auto", "push_to_hub_model_id": null, "push_to_hub_organization": null, "push_to_hub_token": "", "mp_parameters": "", "auto_find_batch_size": false, "full_determinism": false, "torchdynamo": null, "ray_scope": "last", "ddp_timeout": 1800, "torch_compile": false, "torch_compile_backend": null, "torch_compile_mode": null, "include_tokens_per_second": false, "include_num_input_tokens_seen": false, "neftune_noise_alpha": null, "optim_target_modules": null, "batch_eval_metrics": false, "eval_on_start": false, "use_liger_kernel": false, "eval_use_gather_object": false, "average_tokens_across_devices": false }, "lora_config": { "task_type": "CAUSAL_LM", "peft_type": "LORA", "auto_mapping": null, "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", "revision": null, "inference_mode": false, "r": 16, "target_modules": [ "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj", "q_proj" ], "exclude_modules": null, "lora_alpha": 16, "lora_dropout": 0.1, "fan_in_fan_out": false, "bias": "none", "use_rslora": true, "modules_to_save": null, "init_lora_weights": true, "layers_to_transform": null, "layers_pattern": null, "rank_pattern": {}, "alpha_pattern": {}, "megatron_config": null, "megatron_core": "megatron.core", "trainable_token_indices": null, "loftq_config": {}, "eva_config": null, "corda_config": null, "use_dora": false, "alora_invocation_tokens": null, "use_qalora": false, "qalora_group_size": 16, "layer_replication": null, "runtime_config": { "ephemeral_gpu_offload": false }, "lora_bias": false, "target_parameters": null, "arrow_config": null }, "flops": { "eval": 5062218940038400, "train": 7174123736893632.0, "total": 1.2236342676932032e+16 }, "total": { "total": 67344.37337, "train": 48675.5105, "eval": 18668.862870000004 }, "logs": [ { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:09:45.720522", "step": 0, "epoch": 0 }, { "type": "pplx", "content": 226674977.87649825, "timestamp": "2025-09-30 22:09:45.725910", "step": 0, "epoch": 0 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:45.829654", "step": 0, "epoch": 1 }, { "type": "loss", "content": 0.7057779431343079, "timestamp": "2025-09-30 22:09:45.833808", "step": 1, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:45.927323", "step": 1, "epoch": 1 }, { "type": "loss", "content": 0.6982383131980896, "timestamp": "2025-09-30 22:09:45.931697", "step": 2, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:45.989452", "step": 2, "epoch": 1 }, { "type": "loss", "content": 0.7418850064277649, "timestamp": "2025-09-30 22:09:46.001612", "step": 3, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:46.069644", "step": 3, "epoch": 1 }, { "type": "loss", "content": 0.7169809341430664, "timestamp": "2025-09-30 22:09:46.122890", "step": 4, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:46.194754", "step": 4, "epoch": 1 }, { "type": "loss", "content": 0.5053693652153015, "timestamp": "2025-09-30 22:09:46.207488", "step": 5, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:46.291138", "step": 5, "epoch": 1 }, { "type": "loss", "content": 0.5166937708854675, "timestamp": "2025-09-30 22:09:46.303245", "step": 6, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:46.381366", "step": 6, "epoch": 1 }, { "type": "loss", "content": 0.5014775991439819, "timestamp": "2025-09-30 22:09:46.385931", "step": 7, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:46.454635", "step": 7, "epoch": 1 }, { "type": "loss", "content": 0.514745831489563, "timestamp": "2025-09-30 22:09:46.469695", "step": 8, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:46.539001", "step": 8, "epoch": 1 }, { "type": "loss", "content": 0.3292957842350006, "timestamp": "2025-09-30 22:09:46.546966", "step": 9, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:46.612140", "step": 9, "epoch": 1 }, { "type": "loss", "content": 0.3126233220100403, "timestamp": "2025-09-30 22:09:46.617552", "step": 10, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:46.675520", "step": 10, "epoch": 1 }, { "type": "loss", "content": 0.3234768509864807, "timestamp": "2025-09-30 22:09:46.690861", "step": 11, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:46.756152", "step": 11, "epoch": 1 }, { "type": "loss", "content": 0.3332277834415436, "timestamp": "2025-09-30 22:09:46.765624", "step": 12, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:46.823098", "step": 12, "epoch": 1 }, { "type": "loss", "content": 0.16887438297271729, "timestamp": "2025-09-30 22:09:46.828694", "step": 13, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:46.891179", "step": 13, "epoch": 1 }, { "type": "loss", "content": 0.15498413145542145, "timestamp": "2025-09-30 22:09:46.894434", "step": 14, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:46.961785", "step": 14, "epoch": 1 }, { "type": "loss", "content": 0.15807364881038666, "timestamp": "2025-09-30 22:09:46.965388", "step": 15, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:47.021211", "step": 15, "epoch": 1 }, { "type": "loss", "content": 0.16345378756523132, "timestamp": "2025-09-30 22:09:47.037888", "step": 16, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:47.092321", "step": 16, "epoch": 1 }, { "type": "loss", "content": 0.06315434724092484, "timestamp": "2025-09-30 22:09:47.095399", "step": 17, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:47.155095", "step": 17, "epoch": 1 }, { "type": "loss", "content": 0.07140235602855682, "timestamp": "2025-09-30 22:09:47.159326", "step": 18, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:47.217343", "step": 18, "epoch": 1 }, { "type": "loss", "content": 0.07542860507965088, "timestamp": "2025-09-30 22:09:47.231838", "step": 19, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:47.297426", "step": 19, "epoch": 1 }, { "type": "loss", "content": 0.07850368320941925, "timestamp": "2025-09-30 22:09:47.314640", "step": 20, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:47.380582", "step": 20, "epoch": 1 }, { "type": "loss", "content": 0.05402212589979172, "timestamp": "2025-09-30 22:09:47.393830", "step": 21, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:47.463154", "step": 21, "epoch": 1 }, { "type": "loss", "content": 0.04844330623745918, "timestamp": "2025-09-30 22:09:47.477962", "step": 22, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:47.538665", "step": 22, "epoch": 1 }, { "type": "loss", "content": 0.04047473892569542, "timestamp": "2025-09-30 22:09:47.542223", "step": 23, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:47.599055", "step": 23, "epoch": 1 }, { "type": "loss", "content": 0.03956778720021248, "timestamp": "2025-09-30 22:09:47.605916", "step": 24, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:47.666928", "step": 24, "epoch": 1 }, { "type": "loss", "content": 0.03282862529158592, "timestamp": "2025-09-30 22:09:47.680006", "step": 25, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:47.747238", "step": 25, "epoch": 1 }, { "type": "loss", "content": 0.047466158866882324, "timestamp": "2025-09-30 22:09:47.751128", "step": 26, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:47.809803", "step": 26, "epoch": 1 }, { "type": "loss", "content": 0.02898944355547428, "timestamp": "2025-09-30 22:09:47.813747", "step": 27, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:47.883244", "step": 27, "epoch": 1 }, { "type": "loss", "content": 0.02849601022899151, "timestamp": "2025-09-30 22:09:47.892740", "step": 28, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:47.948731", "step": 28, "epoch": 1 }, { "type": "loss", "content": 0.04645688086748123, "timestamp": "2025-09-30 22:09:47.957611", "step": 29, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:48.014118", "step": 29, "epoch": 1 }, { "type": "loss", "content": 0.02362995222210884, "timestamp": "2025-09-30 22:09:48.017619", "step": 30, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:48.092874", "step": 30, "epoch": 1 }, { "type": "loss", "content": 0.02474798448383808, "timestamp": "2025-09-30 22:09:48.098478", "step": 31, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:48.158112", "step": 31, "epoch": 1 }, { "type": "loss", "content": 0.025140417739748955, "timestamp": "2025-09-30 22:09:48.165508", "step": 32, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:48.235509", "step": 32, "epoch": 1 }, { "type": "loss", "content": 0.004965892527252436, "timestamp": "2025-09-30 22:09:48.249272", "step": 33, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:48.306709", "step": 33, "epoch": 1 }, { "type": "loss", "content": 0.03571665287017822, "timestamp": "2025-09-30 22:09:48.321337", "step": 34, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:48.396615", "step": 34, "epoch": 1 }, { "type": "loss", "content": 0.005427081603556871, "timestamp": "2025-09-30 22:09:48.400758", "step": 35, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:48.463184", "step": 35, "epoch": 1 }, { "type": "loss", "content": 0.022603018209338188, "timestamp": "2025-09-30 22:09:48.470249", "step": 36, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:48.526578", "step": 36, "epoch": 1 }, { "type": "loss", "content": 0.017443154007196426, "timestamp": "2025-09-30 22:09:48.530394", "step": 37, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:48.586995", "step": 37, "epoch": 1 }, { "type": "loss", "content": 0.023570295423269272, "timestamp": "2025-09-30 22:09:48.600109", "step": 38, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:48.662782", "step": 38, "epoch": 1 }, { "type": "loss", "content": 0.008299198001623154, "timestamp": "2025-09-30 22:09:48.666655", "step": 39, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:48.724407", "step": 39, "epoch": 1 }, { "type": "loss", "content": 0.023102333769202232, "timestamp": "2025-09-30 22:09:48.740457", "step": 40, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:48.806911", "step": 40, "epoch": 1 }, { "type": "loss", "content": 0.006827209610491991, "timestamp": "2025-09-30 22:09:48.810978", "step": 41, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:48.869752", "step": 41, "epoch": 1 }, { "type": "loss", "content": 0.02204090915620327, "timestamp": "2025-09-30 22:09:48.876097", "step": 42, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:48.935298", "step": 42, "epoch": 1 }, { "type": "loss", "content": 0.036014724522829056, "timestamp": "2025-09-30 22:09:48.938917", "step": 43, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:09:49.019588", "step": 43, "epoch": 1 }, { "type": "loss", "content": 0.005299022886902094, "timestamp": "2025-09-30 22:09:49.030945", "step": 44, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:49.092096", "step": 44, "epoch": 1 }, { "type": "loss", "content": 0.007433965802192688, "timestamp": "2025-09-30 22:09:49.095682", "step": 45, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:49.153079", "step": 45, "epoch": 1 }, { "type": "loss", "content": 0.007419214118272066, "timestamp": "2025-09-30 22:09:49.156971", "step": 46, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:49.226418", "step": 46, "epoch": 1 }, { "type": "loss", "content": 0.03738881275057793, "timestamp": "2025-09-30 22:09:49.231831", "step": 47, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:49.296369", "step": 47, "epoch": 1 }, { "type": "loss", "content": 0.021278386935591698, "timestamp": "2025-09-30 22:09:49.308245", "step": 48, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:49.369779", "step": 48, "epoch": 1 }, { "type": "loss", "content": 0.008051230572164059, "timestamp": "2025-09-30 22:09:49.382874", "step": 49, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:49.448881", "step": 49, "epoch": 1 }, { "type": "loss", "content": 0.03269955888390541, "timestamp": "2025-09-30 22:09:49.455582", "step": 50, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:49.521657", "step": 50, "epoch": 1 }, { "type": "loss", "content": 0.009572034701704979, "timestamp": "2025-09-30 22:09:49.535895", "step": 51, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:49.607418", "step": 51, "epoch": 1 }, { "type": "loss", "content": 0.03262675181031227, "timestamp": "2025-09-30 22:09:49.618732", "step": 52, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:49.698096", "step": 52, "epoch": 1 }, { "type": "loss", "content": 0.031087083742022514, "timestamp": "2025-09-30 22:09:49.702960", "step": 53, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:09:49.774471", "step": 53, "epoch": 1 }, { "type": "loss", "content": 0.028552474454045296, "timestamp": "2025-09-30 22:09:49.790307", "step": 54, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:49.858357", "step": 54, "epoch": 1 }, { "type": "loss", "content": 0.021799175068736076, "timestamp": "2025-09-30 22:09:49.866314", "step": 55, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:49.946732", "step": 55, "epoch": 1 }, { "type": "loss", "content": 0.03151797503232956, "timestamp": "2025-09-30 22:09:49.956360", "step": 56, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:50.025091", "step": 56, "epoch": 1 }, { "type": "loss", "content": 0.02139838971197605, "timestamp": "2025-09-30 22:09:50.036752", "step": 57, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:09:51.671705", "step": 57, "epoch": 1 }, { "type": "pplx", "content": 33459661.644647755, "timestamp": "2025-09-30 22:09:51.677492", "step": 57, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:51.732715", "step": 57, "epoch": 1 }, { "type": "loss", "content": 0.020825445652008057, "timestamp": "2025-09-30 22:09:51.736354", "step": 58, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:51.799744", "step": 58, "epoch": 1 }, { "type": "loss", "content": 0.019294634461402893, "timestamp": "2025-09-30 22:09:51.804206", "step": 59, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:51.863673", "step": 59, "epoch": 1 }, { "type": "loss", "content": 0.020074540749192238, "timestamp": "2025-09-30 22:09:51.879375", "step": 60, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:51.942712", "step": 60, "epoch": 1 }, { "type": "loss", "content": 0.019697507843375206, "timestamp": "2025-09-30 22:09:51.946783", "step": 61, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:52.013178", "step": 61, "epoch": 1 }, { "type": "loss", "content": 0.02132870815694332, "timestamp": "2025-09-30 22:09:52.017164", "step": 62, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:52.092840", "step": 62, "epoch": 1 }, { "type": "loss", "content": 0.01894804835319519, "timestamp": "2025-09-30 22:09:52.095762", "step": 63, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:52.174530", "step": 63, "epoch": 1 }, { "type": "loss", "content": 0.016838889569044113, "timestamp": "2025-09-30 22:09:52.182844", "step": 64, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:52.261938", "step": 64, "epoch": 1 }, { "type": "loss", "content": 0.01592377945780754, "timestamp": "2025-09-30 22:09:52.271881", "step": 65, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:52.339697", "step": 65, "epoch": 1 }, { "type": "loss", "content": 0.02463250793516636, "timestamp": "2025-09-30 22:09:52.343673", "step": 66, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:52.413263", "step": 66, "epoch": 1 }, { "type": "loss", "content": 0.018470000475645065, "timestamp": "2025-09-30 22:09:52.424819", "step": 67, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:52.493307", "step": 67, "epoch": 1 }, { "type": "loss", "content": 0.022894982248544693, "timestamp": "2025-09-30 22:09:52.507145", "step": 68, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:52.577240", "step": 68, "epoch": 1 }, { "type": "loss", "content": 0.029143383726477623, "timestamp": "2025-09-30 22:09:52.581726", "step": 69, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:52.649330", "step": 69, "epoch": 1 }, { "type": "loss", "content": 0.020210010930895805, "timestamp": "2025-09-30 22:09:52.652808", "step": 70, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:52.721839", "step": 70, "epoch": 1 }, { "type": "loss", "content": 0.011494003236293793, "timestamp": "2025-09-30 22:09:52.726534", "step": 71, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:52.789538", "step": 71, "epoch": 1 }, { "type": "loss", "content": 0.028703901916742325, "timestamp": "2025-09-30 22:09:52.796646", "step": 72, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:52.854779", "step": 72, "epoch": 1 }, { "type": "loss", "content": 0.018709659576416016, "timestamp": "2025-09-30 22:09:52.857531", "step": 73, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:52.929717", "step": 73, "epoch": 1 }, { "type": "loss", "content": 0.029944155365228653, "timestamp": "2025-09-30 22:09:52.933498", "step": 74, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:52.993304", "step": 74, "epoch": 1 }, { "type": "loss", "content": 0.020325303077697754, "timestamp": "2025-09-30 22:09:52.997865", "step": 75, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:09:53.069769", "step": 75, "epoch": 1 }, { "type": "loss", "content": 0.02151350863277912, "timestamp": "2025-09-30 22:09:53.078068", "step": 76, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:53.137966", "step": 76, "epoch": 1 }, { "type": "loss", "content": 0.0210565198212862, "timestamp": "2025-09-30 22:09:53.142397", "step": 77, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:53.203236", "step": 77, "epoch": 1 }, { "type": "loss", "content": 0.03604406490921974, "timestamp": "2025-09-30 22:09:53.217868", "step": 78, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:53.277099", "step": 78, "epoch": 1 }, { "type": "loss", "content": 0.036332327872514725, "timestamp": "2025-09-30 22:09:53.281235", "step": 79, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:53.342746", "step": 79, "epoch": 1 }, { "type": "loss", "content": 0.028764527291059494, "timestamp": "2025-09-30 22:09:53.351222", "step": 80, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:53.420316", "step": 80, "epoch": 1 }, { "type": "loss", "content": 0.019019123166799545, "timestamp": "2025-09-30 22:09:53.424641", "step": 81, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:53.480453", "step": 81, "epoch": 1 }, { "type": "loss", "content": 0.02148059569299221, "timestamp": "2025-09-30 22:09:53.486176", "step": 82, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:53.545094", "step": 82, "epoch": 1 }, { "type": "loss", "content": 0.04821930453181267, "timestamp": "2025-09-30 22:09:53.549757", "step": 83, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:53.621024", "step": 83, "epoch": 1 }, { "type": "loss", "content": 0.02800285816192627, "timestamp": "2025-09-30 22:09:53.628236", "step": 84, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:53.694859", "step": 84, "epoch": 1 }, { "type": "loss", "content": 0.0229884572327137, "timestamp": "2025-09-30 22:09:53.702009", "step": 85, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:53.769420", "step": 85, "epoch": 1 }, { "type": "loss", "content": 0.014066072180867195, "timestamp": "2025-09-30 22:09:53.773399", "step": 86, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:53.840322", "step": 86, "epoch": 1 }, { "type": "loss", "content": 0.017103631049394608, "timestamp": "2025-09-30 22:09:53.843455", "step": 87, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:53.903605", "step": 87, "epoch": 1 }, { "type": "loss", "content": 0.020594418048858643, "timestamp": "2025-09-30 22:09:53.919037", "step": 88, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:53.986826", "step": 88, "epoch": 1 }, { "type": "loss", "content": 0.020471200346946716, "timestamp": "2025-09-30 22:09:53.991020", "step": 89, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:54.054065", "step": 89, "epoch": 1 }, { "type": "loss", "content": 0.023065567016601562, "timestamp": "2025-09-30 22:09:54.059760", "step": 90, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:54.134584", "step": 90, "epoch": 1 }, { "type": "loss", "content": 0.026240045204758644, "timestamp": "2025-09-30 22:09:54.147631", "step": 91, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:09:54.215981", "step": 91, "epoch": 1 }, { "type": "loss", "content": 0.024411886930465698, "timestamp": "2025-09-30 22:09:54.233867", "step": 92, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:54.308215", "step": 92, "epoch": 1 }, { "type": "loss", "content": 0.022564176470041275, "timestamp": "2025-09-30 22:09:54.313481", "step": 93, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:54.376766", "step": 93, "epoch": 1 }, { "type": "loss", "content": 0.02625674568116665, "timestamp": "2025-09-30 22:09:54.381908", "step": 94, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:54.442608", "step": 94, "epoch": 1 }, { "type": "loss", "content": 0.026146892458200455, "timestamp": "2025-09-30 22:09:54.448685", "step": 95, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:54.517320", "step": 95, "epoch": 1 }, { "type": "loss", "content": 0.023254042491316795, "timestamp": "2025-09-30 22:09:54.525273", "step": 96, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:54.582927", "step": 96, "epoch": 1 }, { "type": "loss", "content": 0.021697642281651497, "timestamp": "2025-09-30 22:09:54.587639", "step": 97, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:54.648320", "step": 97, "epoch": 1 }, { "type": "loss", "content": 0.020044708624482155, "timestamp": "2025-09-30 22:09:54.660964", "step": 98, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:54.731315", "step": 98, "epoch": 1 }, { "type": "loss", "content": 0.019530335441231728, "timestamp": "2025-09-30 22:09:54.734833", "step": 99, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:54.794372", "step": 99, "epoch": 1 }, { "type": "loss", "content": 0.02076215110719204, "timestamp": "2025-09-30 22:09:54.803435", "step": 100, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:54.864805", "step": 100, "epoch": 1 }, { "type": "loss", "content": 0.018885493278503418, "timestamp": "2025-09-30 22:09:54.869711", "step": 101, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:54.928484", "step": 101, "epoch": 1 }, { "type": "loss", "content": 0.015139119699597359, "timestamp": "2025-09-30 22:09:54.932241", "step": 102, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:54.991083", "step": 102, "epoch": 1 }, { "type": "loss", "content": 0.020732766017317772, "timestamp": "2025-09-30 22:09:54.994243", "step": 103, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:55.058758", "step": 103, "epoch": 1 }, { "type": "loss", "content": 0.031727004796266556, "timestamp": "2025-09-30 22:09:55.066276", "step": 104, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:55.123302", "step": 104, "epoch": 1 }, { "type": "loss", "content": 0.03581748530268669, "timestamp": "2025-09-30 22:09:55.132399", "step": 105, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:55.190214", "step": 105, "epoch": 1 }, { "type": "loss", "content": 0.0061849248595535755, "timestamp": "2025-09-30 22:09:55.193819", "step": 106, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:55.257208", "step": 106, "epoch": 1 }, { "type": "loss", "content": 0.02108680084347725, "timestamp": "2025-09-30 22:09:55.263242", "step": 107, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:55.333196", "step": 107, "epoch": 1 }, { "type": "loss", "content": 0.054573871195316315, "timestamp": "2025-09-30 22:09:55.352127", "step": 108, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:55.419704", "step": 108, "epoch": 1 }, { "type": "loss", "content": 0.056397486478090286, "timestamp": "2025-09-30 22:09:55.423709", "step": 109, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:09:55.503381", "step": 109, "epoch": 1 }, { "type": "loss", "content": 0.045181386172771454, "timestamp": "2025-09-30 22:09:55.507646", "step": 110, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:55.565463", "step": 110, "epoch": 1 }, { "type": "loss", "content": 0.006864451337605715, "timestamp": "2025-09-30 22:09:55.570595", "step": 111, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:55.629518", "step": 111, "epoch": 1 }, { "type": "loss", "content": 0.035181932151317596, "timestamp": "2025-09-30 22:09:55.638791", "step": 112, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:55.698419", "step": 112, "epoch": 1 }, { "type": "loss", "content": 0.02644294500350952, "timestamp": "2025-09-30 22:09:55.702122", "step": 113, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:55.762278", "step": 113, "epoch": 1 }, { "type": "loss", "content": 0.024839377030730247, "timestamp": "2025-09-30 22:09:55.767722", "step": 114, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:09:57.404156", "step": 114, "epoch": 1 }, { "type": "pplx", "content": 34367165.57846854, "timestamp": "2025-09-30 22:09:57.417807", "step": 114, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:57.476760", "step": 114, "epoch": 1 }, { "type": "loss", "content": 0.009317861869931221, "timestamp": "2025-09-30 22:09:57.481123", "step": 115, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:09:57.549026", "step": 115, "epoch": 1 }, { "type": "loss", "content": 0.03304336592555046, "timestamp": "2025-09-30 22:09:57.556362", "step": 116, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:57.621741", "step": 116, "epoch": 1 }, { "type": "loss", "content": 0.02041228488087654, "timestamp": "2025-09-30 22:09:57.625489", "step": 117, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:57.707532", "step": 117, "epoch": 1 }, { "type": "loss", "content": 0.020021427422761917, "timestamp": "2025-09-30 22:09:57.711221", "step": 118, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:57.770352", "step": 118, "epoch": 1 }, { "type": "loss", "content": 0.01669391058385372, "timestamp": "2025-09-30 22:09:57.774354", "step": 119, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:57.833648", "step": 119, "epoch": 1 }, { "type": "loss", "content": 0.019399311393499374, "timestamp": "2025-09-30 22:09:57.840377", "step": 120, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:57.902145", "step": 120, "epoch": 1 }, { "type": "loss", "content": 0.027869191020727158, "timestamp": "2025-09-30 22:09:57.905157", "step": 121, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:09:57.963613", "step": 121, "epoch": 1 }, { "type": "loss", "content": 0.026405667886137962, "timestamp": "2025-09-30 22:09:57.967225", "step": 122, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:09:58.036235", "step": 122, "epoch": 1 }, { "type": "loss", "content": 0.025716735050082207, "timestamp": "2025-09-30 22:09:58.040067", "step": 123, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:58.128759", "step": 123, "epoch": 1 }, { "type": "loss", "content": 0.021928545087575912, "timestamp": "2025-09-30 22:09:58.136284", "step": 124, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:58.194040", "step": 124, "epoch": 1 }, { "type": "loss", "content": 0.024018412455916405, "timestamp": "2025-09-30 22:09:58.203869", "step": 125, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:58.275576", "step": 125, "epoch": 1 }, { "type": "loss", "content": 0.024753393605351448, "timestamp": "2025-09-30 22:09:58.279054", "step": 126, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:09:58.335015", "step": 126, "epoch": 1 }, { "type": "loss", "content": 0.025028575211763382, "timestamp": "2025-09-30 22:09:58.339229", "step": 127, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:58.409024", "step": 127, "epoch": 1 }, { "type": "loss", "content": 0.022491442039608955, "timestamp": "2025-09-30 22:09:58.415306", "step": 128, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:58.472633", "step": 128, "epoch": 1 }, { "type": "loss", "content": 0.021387087181210518, "timestamp": "2025-09-30 22:09:58.475769", "step": 129, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:58.544050", "step": 129, "epoch": 1 }, { "type": "loss", "content": 0.030490174889564514, "timestamp": "2025-09-30 22:09:58.547113", "step": 130, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:58.608056", "step": 130, "epoch": 1 }, { "type": "loss", "content": 0.021837439388036728, "timestamp": "2025-09-30 22:09:58.611804", "step": 131, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:58.672124", "step": 131, "epoch": 1 }, { "type": "loss", "content": 0.022853758186101913, "timestamp": "2025-09-30 22:09:58.679715", "step": 132, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:58.743010", "step": 132, "epoch": 1 }, { "type": "loss", "content": 0.02393500693142414, "timestamp": "2025-09-30 22:09:58.747907", "step": 133, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:58.812859", "step": 133, "epoch": 1 }, { "type": "loss", "content": 0.024709191173315048, "timestamp": "2025-09-30 22:09:58.817248", "step": 134, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:09:58.874630", "step": 134, "epoch": 1 }, { "type": "loss", "content": 0.02340940572321415, "timestamp": "2025-09-30 22:09:58.878363", "step": 135, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:58.955588", "step": 135, "epoch": 1 }, { "type": "loss", "content": 0.021506870165467262, "timestamp": "2025-09-30 22:09:58.971620", "step": 136, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:09:59.036619", "step": 136, "epoch": 1 }, { "type": "loss", "content": 0.01850222609937191, "timestamp": "2025-09-30 22:09:59.041465", "step": 137, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:59.114440", "step": 137, "epoch": 1 }, { "type": "loss", "content": 0.025246715173125267, "timestamp": "2025-09-30 22:09:59.119401", "step": 138, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:59.185394", "step": 138, "epoch": 1 }, { "type": "loss", "content": 0.019027644768357277, "timestamp": "2025-09-30 22:09:59.188937", "step": 139, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:59.254993", "step": 139, "epoch": 1 }, { "type": "loss", "content": 0.023021994158625603, "timestamp": "2025-09-30 22:09:59.268285", "step": 140, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:59.333791", "step": 140, "epoch": 1 }, { "type": "loss", "content": 0.018915778025984764, "timestamp": "2025-09-30 22:09:59.338389", "step": 141, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:59.397001", "step": 141, "epoch": 1 }, { "type": "loss", "content": 0.02081509307026863, "timestamp": "2025-09-30 22:09:59.400430", "step": 142, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:59.475598", "step": 142, "epoch": 1 }, { "type": "loss", "content": 0.02820817194879055, "timestamp": "2025-09-30 22:09:59.478597", "step": 143, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:59.539292", "step": 143, "epoch": 1 }, { "type": "loss", "content": 0.01714860461652279, "timestamp": "2025-09-30 22:09:59.545917", "step": 144, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:59.613640", "step": 144, "epoch": 1 }, { "type": "loss", "content": 0.01967170275747776, "timestamp": "2025-09-30 22:09:59.626738", "step": 145, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:59.695249", "step": 145, "epoch": 1 }, { "type": "loss", "content": 0.015452763997018337, "timestamp": "2025-09-30 22:09:59.698910", "step": 146, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:59.758042", "step": 146, "epoch": 1 }, { "type": "loss", "content": 0.031086495146155357, "timestamp": "2025-09-30 22:09:59.761287", "step": 147, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:59.832230", "step": 147, "epoch": 1 }, { "type": "loss", "content": 0.035771798342466354, "timestamp": "2025-09-30 22:09:59.838403", "step": 148, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:09:59.896383", "step": 148, "epoch": 1 }, { "type": "loss", "content": 0.052823007106781006, "timestamp": "2025-09-30 22:09:59.904646", "step": 149, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:09:59.974528", "step": 149, "epoch": 1 }, { "type": "loss", "content": 0.0519816055893898, "timestamp": "2025-09-30 22:09:59.979756", "step": 150, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:00.039600", "step": 150, "epoch": 1 }, { "type": "loss", "content": 0.020907681435346603, "timestamp": "2025-09-30 22:10:00.044125", "step": 151, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:00.110151", "step": 151, "epoch": 1 }, { "type": "loss", "content": 0.005571091081947088, "timestamp": "2025-09-30 22:10:00.122291", "step": 152, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:00.190990", "step": 152, "epoch": 1 }, { "type": "loss", "content": 0.006688456982374191, "timestamp": "2025-09-30 22:10:00.193396", "step": 153, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:00.257532", "step": 153, "epoch": 1 }, { "type": "loss", "content": 0.02993638627231121, "timestamp": "2025-09-30 22:10:00.260041", "step": 154, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:00.326910", "step": 154, "epoch": 1 }, { "type": "loss", "content": 0.03653806075453758, "timestamp": "2025-09-30 22:10:00.336903", "step": 155, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:00.399218", "step": 155, "epoch": 1 }, { "type": "loss", "content": 0.02178972400724888, "timestamp": "2025-09-30 22:10:00.406459", "step": 156, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:00.471221", "step": 156, "epoch": 1 }, { "type": "loss", "content": 0.03773200884461403, "timestamp": "2025-09-30 22:10:00.481225", "step": 157, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:00.544734", "step": 157, "epoch": 1 }, { "type": "loss", "content": 0.06024942919611931, "timestamp": "2025-09-30 22:10:00.555024", "step": 158, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:00.626258", "step": 158, "epoch": 1 }, { "type": "loss", "content": 0.051464516669511795, "timestamp": "2025-09-30 22:10:00.629304", "step": 159, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:00.695658", "step": 159, "epoch": 1 }, { "type": "loss", "content": 0.03535311296582222, "timestamp": "2025-09-30 22:10:00.705942", "step": 160, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:00.770799", "step": 160, "epoch": 1 }, { "type": "loss", "content": 0.021262094378471375, "timestamp": "2025-09-30 22:10:00.773120", "step": 161, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:00.836550", "step": 161, "epoch": 1 }, { "type": "loss", "content": 0.02948129177093506, "timestamp": "2025-09-30 22:10:00.840114", "step": 162, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:00.903882", "step": 162, "epoch": 1 }, { "type": "loss", "content": 0.020708870142698288, "timestamp": "2025-09-30 22:10:00.918926", "step": 163, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:00.987530", "step": 163, "epoch": 1 }, { "type": "loss", "content": 0.01856974884867668, "timestamp": "2025-09-30 22:10:01.003210", "step": 164, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:01.086270", "step": 164, "epoch": 1 }, { "type": "loss", "content": 0.02260742522776127, "timestamp": "2025-09-30 22:10:01.089350", "step": 165, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:01.159681", "step": 165, "epoch": 1 }, { "type": "loss", "content": 0.024135034531354904, "timestamp": "2025-09-30 22:10:01.167552", "step": 166, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:01.253829", "step": 166, "epoch": 1 }, { "type": "loss", "content": 0.024836549535393715, "timestamp": "2025-09-30 22:10:01.257725", "step": 167, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:01.317725", "step": 167, "epoch": 1 }, { "type": "loss", "content": 0.021925508975982666, "timestamp": "2025-09-30 22:10:01.324948", "step": 168, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:01.383629", "step": 168, "epoch": 1 }, { "type": "loss", "content": 0.026183176785707474, "timestamp": "2025-09-30 22:10:01.386896", "step": 169, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:01.445667", "step": 169, "epoch": 1 }, { "type": "loss", "content": 0.027825552970170975, "timestamp": "2025-09-30 22:10:01.448737", "step": 170, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:01.514985", "step": 170, "epoch": 1 }, { "type": "loss", "content": 0.02336815930902958, "timestamp": "2025-09-30 22:10:01.524465", "step": 171, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:10:03.122432", "step": 171, "epoch": 1 }, { "type": "pplx", "content": 30444627.67679443, "timestamp": "2025-09-30 22:10:03.130412", "step": 171, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:03.189713", "step": 171, "epoch": 1 }, { "type": "loss", "content": 0.025213051587343216, "timestamp": "2025-09-30 22:10:03.207338", "step": 172, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:03.270568", "step": 172, "epoch": 1 }, { "type": "loss", "content": 0.026924652978777885, "timestamp": "2025-09-30 22:10:03.289893", "step": 173, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:03.358458", "step": 173, "epoch": 1 }, { "type": "loss", "content": 0.019906342029571533, "timestamp": "2025-09-30 22:10:03.367731", "step": 174, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:03.431420", "step": 174, "epoch": 1 }, { "type": "loss", "content": 0.02854965254664421, "timestamp": "2025-09-30 22:10:03.435169", "step": 175, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:03.502168", "step": 175, "epoch": 1 }, { "type": "loss", "content": 0.029346134513616562, "timestamp": "2025-09-30 22:10:03.513567", "step": 176, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:03.569994", "step": 176, "epoch": 1 }, { "type": "loss", "content": 0.027899976819753647, "timestamp": "2025-09-30 22:10:03.595350", "step": 177, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:03.660257", "step": 177, "epoch": 1 }, { "type": "loss", "content": 0.031159192323684692, "timestamp": "2025-09-30 22:10:03.664061", "step": 178, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:03.758193", "step": 178, "epoch": 1 }, { "type": "loss", "content": 0.025665393099188805, "timestamp": "2025-09-30 22:10:03.762946", "step": 179, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:03.828469", "step": 179, "epoch": 1 }, { "type": "loss", "content": 0.02343871258199215, "timestamp": "2025-09-30 22:10:03.862207", "step": 180, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:03.934534", "step": 180, "epoch": 1 }, { "type": "loss", "content": 0.023453569039702415, "timestamp": "2025-09-30 22:10:03.941948", "step": 181, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:04.015301", "step": 181, "epoch": 1 }, { "type": "loss", "content": 0.025829827412962914, "timestamp": "2025-09-30 22:10:04.020573", "step": 182, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:04.091297", "step": 182, "epoch": 1 }, { "type": "loss", "content": 0.029110519215464592, "timestamp": "2025-09-30 22:10:04.102375", "step": 183, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:04.171598", "step": 183, "epoch": 1 }, { "type": "loss", "content": 0.02779129333794117, "timestamp": "2025-09-30 22:10:04.182335", "step": 184, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:04.244865", "step": 184, "epoch": 1 }, { "type": "loss", "content": 0.026309384033083916, "timestamp": "2025-09-30 22:10:04.248569", "step": 185, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:04.317008", "step": 185, "epoch": 1 }, { "type": "loss", "content": 0.020971303805708885, "timestamp": "2025-09-30 22:10:04.319635", "step": 186, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:04.389723", "step": 186, "epoch": 1 }, { "type": "loss", "content": 0.026080194860696793, "timestamp": "2025-09-30 22:10:04.398670", "step": 187, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:04.462204", "step": 187, "epoch": 1 }, { "type": "loss", "content": 0.022094666957855225, "timestamp": "2025-09-30 22:10:04.474646", "step": 188, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:04.546212", "step": 188, "epoch": 1 }, { "type": "loss", "content": 0.019831160083413124, "timestamp": "2025-09-30 22:10:04.548835", "step": 189, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:04.611573", "step": 189, "epoch": 1 }, { "type": "loss", "content": 0.02252844348549843, "timestamp": "2025-09-30 22:10:04.615230", "step": 190, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:04.698424", "step": 190, "epoch": 1 }, { "type": "loss", "content": 0.021682869642972946, "timestamp": "2025-09-30 22:10:04.700932", "step": 191, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:04.769139", "step": 191, "epoch": 1 }, { "type": "loss", "content": 0.02150973677635193, "timestamp": "2025-09-30 22:10:04.775344", "step": 192, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:04.834112", "step": 192, "epoch": 1 }, { "type": "loss", "content": 0.014492852613329887, "timestamp": "2025-09-30 22:10:04.843621", "step": 193, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:04.918930", "step": 193, "epoch": 1 }, { "type": "loss", "content": 0.015967510640621185, "timestamp": "2025-09-30 22:10:04.923925", "step": 194, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:04.980759", "step": 194, "epoch": 1 }, { "type": "loss", "content": 0.01770109124481678, "timestamp": "2025-09-30 22:10:04.994927", "step": 195, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:05.070096", "step": 195, "epoch": 1 }, { "type": "loss", "content": 0.02010829746723175, "timestamp": "2025-09-30 22:10:05.080898", "step": 196, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:05.151418", "step": 196, "epoch": 1 }, { "type": "loss", "content": 0.010940426029264927, "timestamp": "2025-09-30 22:10:05.165479", "step": 197, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:05.227588", "step": 197, "epoch": 1 }, { "type": "loss", "content": 0.02102290280163288, "timestamp": "2025-09-30 22:10:05.233173", "step": 198, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:05.295898", "step": 198, "epoch": 1 }, { "type": "loss", "content": 0.038564298301935196, "timestamp": "2025-09-30 22:10:05.299015", "step": 199, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:05.358240", "step": 199, "epoch": 1 }, { "type": "loss", "content": 0.030264217406511307, "timestamp": "2025-09-30 22:10:05.372402", "step": 200, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:05.436584", "step": 200, "epoch": 1 }, { "type": "loss", "content": 0.019798846915364265, "timestamp": "2025-09-30 22:10:05.440647", "step": 201, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:05.511249", "step": 201, "epoch": 1 }, { "type": "loss", "content": 0.02034393884241581, "timestamp": "2025-09-30 22:10:05.522972", "step": 202, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:05.588730", "step": 202, "epoch": 1 }, { "type": "loss", "content": 0.019814888015389442, "timestamp": "2025-09-30 22:10:05.599717", "step": 203, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:05.665164", "step": 203, "epoch": 1 }, { "type": "loss", "content": 0.01977935992181301, "timestamp": "2025-09-30 22:10:05.671120", "step": 204, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:05.733763", "step": 204, "epoch": 1 }, { "type": "loss", "content": 0.029726387932896614, "timestamp": "2025-09-30 22:10:05.736828", "step": 205, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:05.793001", "step": 205, "epoch": 1 }, { "type": "loss", "content": 0.008051145821809769, "timestamp": "2025-09-30 22:10:05.796198", "step": 206, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:05.855235", "step": 206, "epoch": 1 }, { "type": "loss", "content": 0.03421095758676529, "timestamp": "2025-09-30 22:10:05.859122", "step": 207, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:05.918769", "step": 207, "epoch": 1 }, { "type": "loss", "content": 0.01968538574874401, "timestamp": "2025-09-30 22:10:05.925875", "step": 208, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:05.982794", "step": 208, "epoch": 1 }, { "type": "loss", "content": 0.03160746023058891, "timestamp": "2025-09-30 22:10:05.988337", "step": 209, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:06.054850", "step": 209, "epoch": 1 }, { "type": "loss", "content": 0.04332786053419113, "timestamp": "2025-09-30 22:10:06.062819", "step": 210, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:06.139111", "step": 210, "epoch": 1 }, { "type": "loss", "content": 0.020400291308760643, "timestamp": "2025-09-30 22:10:06.146696", "step": 211, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:06.214028", "step": 211, "epoch": 1 }, { "type": "loss", "content": 0.05565622076392174, "timestamp": "2025-09-30 22:10:06.223281", "step": 212, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:06.280632", "step": 212, "epoch": 1 }, { "type": "loss", "content": 0.03217899426817894, "timestamp": "2025-09-30 22:10:06.287053", "step": 213, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:06.350273", "step": 213, "epoch": 1 }, { "type": "loss", "content": 0.01874413527548313, "timestamp": "2025-09-30 22:10:06.356717", "step": 214, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:06.422063", "step": 214, "epoch": 1 }, { "type": "loss", "content": 0.037103354930877686, "timestamp": "2025-09-30 22:10:06.429637", "step": 215, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:06.493219", "step": 215, "epoch": 1 }, { "type": "loss", "content": 0.030940750613808632, "timestamp": "2025-09-30 22:10:06.505199", "step": 216, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:06.578758", "step": 216, "epoch": 1 }, { "type": "loss", "content": 0.0197443924844265, "timestamp": "2025-09-30 22:10:06.585913", "step": 217, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:06.658187", "step": 217, "epoch": 1 }, { "type": "loss", "content": 0.020792236551642418, "timestamp": "2025-09-30 22:10:06.660699", "step": 218, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:06.717629", "step": 218, "epoch": 1 }, { "type": "loss", "content": 0.02086322009563446, "timestamp": "2025-09-30 22:10:06.720244", "step": 219, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:06.795016", "step": 219, "epoch": 1 }, { "type": "loss", "content": 0.01735932007431984, "timestamp": "2025-09-30 22:10:06.801454", "step": 220, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:06.859512", "step": 220, "epoch": 1 }, { "type": "loss", "content": 0.023645946756005287, "timestamp": "2025-09-30 22:10:06.862985", "step": 221, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:06.923013", "step": 221, "epoch": 1 }, { "type": "loss", "content": 0.026022594422101974, "timestamp": "2025-09-30 22:10:06.926308", "step": 222, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:06.984919", "step": 222, "epoch": 1 }, { "type": "loss", "content": 0.02121904492378235, "timestamp": "2025-09-30 22:10:06.991504", "step": 223, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:07.066529", "step": 223, "epoch": 1 }, { "type": "loss", "content": 0.027569929137825966, "timestamp": "2025-09-30 22:10:07.073649", "step": 224, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:07.131861", "step": 224, "epoch": 1 }, { "type": "loss", "content": 0.022572649642825127, "timestamp": "2025-09-30 22:10:07.139150", "step": 225, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:07.196403", "step": 225, "epoch": 1 }, { "type": "loss", "content": 0.02417564019560814, "timestamp": "2025-09-30 22:10:07.198523", "step": 226, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:07.255106", "step": 226, "epoch": 1 }, { "type": "loss", "content": 0.027327995747327805, "timestamp": "2025-09-30 22:10:07.260826", "step": 227, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:07.335925", "step": 227, "epoch": 1 }, { "type": "loss", "content": 0.018605349585413933, "timestamp": "2025-09-30 22:10:07.346966", "step": 228, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:10:08.896406", "step": 228, "epoch": 1 }, { "type": "pplx", "content": 30884345.685848907, "timestamp": "2025-09-30 22:10:08.897993", "step": 228, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:08.949580", "step": 228, "epoch": 1 }, { "type": "loss", "content": 0.021486392244696617, "timestamp": "2025-09-30 22:10:08.952193", "step": 229, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:09.007817", "step": 229, "epoch": 1 }, { "type": "loss", "content": 0.02346794866025448, "timestamp": "2025-09-30 22:10:09.010228", "step": 230, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:09.066408", "step": 230, "epoch": 1 }, { "type": "loss", "content": 0.022495094686746597, "timestamp": "2025-09-30 22:10:09.068495", "step": 231, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:09.130931", "step": 231, "epoch": 1 }, { "type": "loss", "content": 0.025627722963690758, "timestamp": "2025-09-30 22:10:09.136540", "step": 232, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:09.192596", "step": 232, "epoch": 1 }, { "type": "loss", "content": 0.025128453969955444, "timestamp": "2025-09-30 22:10:09.194667", "step": 233, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:09.257595", "step": 233, "epoch": 1 }, { "type": "loss", "content": 0.021439703181385994, "timestamp": "2025-09-30 22:10:09.262908", "step": 234, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:09.320130", "step": 234, "epoch": 1 }, { "type": "loss", "content": 0.025162285193800926, "timestamp": "2025-09-30 22:10:09.322981", "step": 235, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:09.379439", "step": 235, "epoch": 1 }, { "type": "loss", "content": 0.026900988072156906, "timestamp": "2025-09-30 22:10:09.386142", "step": 236, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:09.441059", "step": 236, "epoch": 1 }, { "type": "loss", "content": 0.024046484380960464, "timestamp": "2025-09-30 22:10:09.445390", "step": 237, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:09.516816", "step": 237, "epoch": 1 }, { "type": "loss", "content": 0.023616518825292587, "timestamp": "2025-09-30 22:10:09.520114", "step": 238, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:09.578308", "step": 238, "epoch": 1 }, { "type": "loss", "content": 0.025127580389380455, "timestamp": "2025-09-30 22:10:09.581575", "step": 239, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:09.651946", "step": 239, "epoch": 1 }, { "type": "loss", "content": 0.02287052944302559, "timestamp": "2025-09-30 22:10:09.659087", "step": 240, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:09.721058", "step": 240, "epoch": 1 }, { "type": "loss", "content": 0.022945618256926537, "timestamp": "2025-09-30 22:10:09.723823", "step": 241, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:09.785061", "step": 241, "epoch": 1 }, { "type": "loss", "content": 0.023390578106045723, "timestamp": "2025-09-30 22:10:09.791062", "step": 242, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:09.854408", "step": 242, "epoch": 1 }, { "type": "loss", "content": 0.022397786378860474, "timestamp": "2025-09-30 22:10:09.858275", "step": 243, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:09.915704", "step": 243, "epoch": 1 }, { "type": "loss", "content": 0.021386751905083656, "timestamp": "2025-09-30 22:10:09.923545", "step": 244, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:10.001700", "step": 244, "epoch": 1 }, { "type": "loss", "content": 0.02710045874118805, "timestamp": "2025-09-30 22:10:10.004521", "step": 245, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:10.076145", "step": 245, "epoch": 1 }, { "type": "loss", "content": 0.026470202952623367, "timestamp": "2025-09-30 22:10:10.083954", "step": 246, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:10.146486", "step": 246, "epoch": 1 }, { "type": "loss", "content": 0.025309713557362556, "timestamp": "2025-09-30 22:10:10.149331", "step": 247, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:10.207779", "step": 247, "epoch": 1 }, { "type": "loss", "content": 0.023809578269720078, "timestamp": "2025-09-30 22:10:10.218339", "step": 248, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:10.277901", "step": 248, "epoch": 1 }, { "type": "loss", "content": 0.028678497299551964, "timestamp": "2025-09-30 22:10:10.286335", "step": 249, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:10.367509", "step": 249, "epoch": 1 }, { "type": "loss", "content": 0.019650932401418686, "timestamp": "2025-09-30 22:10:10.370450", "step": 250, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:10.429746", "step": 250, "epoch": 1 }, { "type": "loss", "content": 0.025458762422204018, "timestamp": "2025-09-30 22:10:10.438017", "step": 251, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:10.495845", "step": 251, "epoch": 1 }, { "type": "loss", "content": 0.017796490341424942, "timestamp": "2025-09-30 22:10:10.505651", "step": 252, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:10.567499", "step": 252, "epoch": 1 }, { "type": "loss", "content": 0.015613814815878868, "timestamp": "2025-09-30 22:10:10.575269", "step": 253, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:10.640852", "step": 253, "epoch": 1 }, { "type": "loss", "content": 0.026363762095570564, "timestamp": "2025-09-30 22:10:10.648408", "step": 254, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:10.708360", "step": 254, "epoch": 1 }, { "type": "loss", "content": 0.018938321620225906, "timestamp": "2025-09-30 22:10:10.719034", "step": 255, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:10:10.779213", "step": 255, "epoch": 1 }, { "type": "loss", "content": 0.019102880731225014, "timestamp": "2025-09-30 22:10:10.792170", "step": 256, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:10.853523", "step": 256, "epoch": 1 }, { "type": "loss", "content": 0.015843430534005165, "timestamp": "2025-09-30 22:10:10.861130", "step": 257, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:10.919829", "step": 257, "epoch": 1 }, { "type": "loss", "content": 0.033805202692747116, "timestamp": "2025-09-30 22:10:10.930292", "step": 258, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:10.995113", "step": 258, "epoch": 1 }, { "type": "loss", "content": 0.024396615102887154, "timestamp": "2025-09-30 22:10:10.998228", "step": 259, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:11.058884", "step": 259, "epoch": 1 }, { "type": "loss", "content": 0.020021233707666397, "timestamp": "2025-09-30 22:10:11.070816", "step": 260, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:11.140519", "step": 260, "epoch": 1 }, { "type": "loss", "content": 0.02062203176319599, "timestamp": "2025-09-30 22:10:11.146244", "step": 261, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:11.216345", "step": 261, "epoch": 1 }, { "type": "loss", "content": 0.021955931559205055, "timestamp": "2025-09-30 22:10:11.218795", "step": 262, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:11.277173", "step": 262, "epoch": 1 }, { "type": "loss", "content": 0.021168585866689682, "timestamp": "2025-09-30 22:10:11.280607", "step": 263, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:11.337964", "step": 263, "epoch": 1 }, { "type": "loss", "content": 0.014190520159900188, "timestamp": "2025-09-30 22:10:11.344624", "step": 264, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:11.404989", "step": 264, "epoch": 1 }, { "type": "loss", "content": 0.021121378988027573, "timestamp": "2025-09-30 22:10:11.408576", "step": 265, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:11.466890", "step": 265, "epoch": 1 }, { "type": "loss", "content": 0.019114112481474876, "timestamp": "2025-09-30 22:10:11.470548", "step": 266, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:11.535659", "step": 266, "epoch": 1 }, { "type": "loss", "content": 0.025738313794136047, "timestamp": "2025-09-30 22:10:11.539012", "step": 267, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:11.606169", "step": 267, "epoch": 1 }, { "type": "loss", "content": 0.03802191838622093, "timestamp": "2025-09-30 22:10:11.612777", "step": 268, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:11.672111", "step": 268, "epoch": 1 }, { "type": "loss", "content": 0.018177002668380737, "timestamp": "2025-09-30 22:10:11.674706", "step": 269, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:11.747999", "step": 269, "epoch": 1 }, { "type": "loss", "content": 0.0222756527364254, "timestamp": "2025-09-30 22:10:11.752497", "step": 270, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:11.820640", "step": 270, "epoch": 1 }, { "type": "loss", "content": 0.02620212733745575, "timestamp": "2025-09-30 22:10:11.827843", "step": 271, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:11.890053", "step": 271, "epoch": 1 }, { "type": "loss", "content": 0.019811127334833145, "timestamp": "2025-09-30 22:10:11.902278", "step": 272, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:11.969423", "step": 272, "epoch": 1 }, { "type": "loss", "content": 0.03354780375957489, "timestamp": "2025-09-30 22:10:11.977684", "step": 273, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:12.043117", "step": 273, "epoch": 1 }, { "type": "loss", "content": 0.011064850725233555, "timestamp": "2025-09-30 22:10:12.052548", "step": 274, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:12.123721", "step": 274, "epoch": 1 }, { "type": "loss", "content": 0.02540605701506138, "timestamp": "2025-09-30 22:10:12.131500", "step": 275, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:12.199338", "step": 275, "epoch": 1 }, { "type": "loss", "content": 0.022669294849038124, "timestamp": "2025-09-30 22:10:12.206727", "step": 276, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:12.268274", "step": 276, "epoch": 1 }, { "type": "loss", "content": 0.010532691143453121, "timestamp": "2025-09-30 22:10:12.271962", "step": 277, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:12.339414", "step": 277, "epoch": 1 }, { "type": "loss", "content": 0.03192717209458351, "timestamp": "2025-09-30 22:10:12.346985", "step": 278, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:12.407191", "step": 278, "epoch": 1 }, { "type": "loss", "content": 0.027782125398516655, "timestamp": "2025-09-30 22:10:12.413865", "step": 279, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:12.469832", "step": 279, "epoch": 1 }, { "type": "loss", "content": 0.04358547180891037, "timestamp": "2025-09-30 22:10:12.484214", "step": 280, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:12.545511", "step": 280, "epoch": 1 }, { "type": "loss", "content": 0.018636440858244896, "timestamp": "2025-09-30 22:10:12.556165", "step": 281, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:12.620115", "step": 281, "epoch": 1 }, { "type": "loss", "content": 0.01732656918466091, "timestamp": "2025-09-30 22:10:12.623301", "step": 282, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:12.680350", "step": 282, "epoch": 1 }, { "type": "loss", "content": 0.021487489342689514, "timestamp": "2025-09-30 22:10:12.689464", "step": 283, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:12.750692", "step": 283, "epoch": 1 }, { "type": "loss", "content": 0.031494829803705215, "timestamp": "2025-09-30 22:10:12.757457", "step": 284, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:12.818583", "step": 284, "epoch": 1 }, { "type": "loss", "content": 0.02197718620300293, "timestamp": "2025-09-30 22:10:12.820809", "step": 285, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:10:14.137228", "step": 285, "epoch": 1 }, { "type": "pplx", "content": 32380318.740329083, "timestamp": "2025-09-30 22:10:14.140588", "step": 285, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:14.192593", "step": 285, "epoch": 1 }, { "type": "loss", "content": 0.012143162079155445, "timestamp": "2025-09-30 22:10:14.195045", "step": 286, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:10:14.249538", "step": 286, "epoch": 1 }, { "type": "loss", "content": 0.018916593864560127, "timestamp": "2025-09-30 22:10:14.252578", "step": 287, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:14.309892", "step": 287, "epoch": 1 }, { "type": "loss", "content": 0.03274720534682274, "timestamp": "2025-09-30 22:10:14.316172", "step": 288, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:14.369291", "step": 288, "epoch": 1 }, { "type": "loss", "content": 0.02828342653810978, "timestamp": "2025-09-30 22:10:14.373423", "step": 289, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:14.429706", "step": 289, "epoch": 1 }, { "type": "loss", "content": 0.013313318602740765, "timestamp": "2025-09-30 22:10:14.434084", "step": 290, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:14.497217", "step": 290, "epoch": 1 }, { "type": "loss", "content": 0.022141721099615097, "timestamp": "2025-09-30 22:10:14.499249", "step": 291, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:14.557870", "step": 291, "epoch": 1 }, { "type": "loss", "content": 0.026357505470514297, "timestamp": "2025-09-30 22:10:14.566973", "step": 292, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:14.621517", "step": 292, "epoch": 1 }, { "type": "loss", "content": 0.01601177267730236, "timestamp": "2025-09-30 22:10:14.631578", "step": 293, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:14.685483", "step": 293, "epoch": 1 }, { "type": "loss", "content": 0.028203275054693222, "timestamp": "2025-09-30 22:10:14.690289", "step": 294, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:14.748068", "step": 294, "epoch": 1 }, { "type": "loss", "content": 0.019105268642306328, "timestamp": "2025-09-30 22:10:14.751600", "step": 295, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:14.806342", "step": 295, "epoch": 1 }, { "type": "loss", "content": 0.016064850613474846, "timestamp": "2025-09-30 22:10:14.814490", "step": 296, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:14.870963", "step": 296, "epoch": 1 }, { "type": "loss", "content": 0.020204851403832436, "timestamp": "2025-09-30 22:10:14.872966", "step": 297, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:14.926632", "step": 297, "epoch": 1 }, { "type": "loss", "content": 0.026143083348870277, "timestamp": "2025-09-30 22:10:14.932947", "step": 298, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:14.991347", "step": 298, "epoch": 1 }, { "type": "loss", "content": 0.015985313802957535, "timestamp": "2025-09-30 22:10:14.997778", "step": 299, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:15.061141", "step": 299, "epoch": 1 }, { "type": "loss", "content": 0.01684342697262764, "timestamp": "2025-09-30 22:10:15.068483", "step": 300, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:15.124062", "step": 300, "epoch": 1 }, { "type": "loss", "content": 0.02533099241554737, "timestamp": "2025-09-30 22:10:15.128923", "step": 301, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:15.184753", "step": 301, "epoch": 1 }, { "type": "loss", "content": 0.015226058661937714, "timestamp": "2025-09-30 22:10:15.191351", "step": 302, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:15.249276", "step": 302, "epoch": 1 }, { "type": "loss", "content": 0.013342132791876793, "timestamp": "2025-09-30 22:10:15.253023", "step": 303, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:15.308744", "step": 303, "epoch": 1 }, { "type": "loss", "content": 0.01799563132226467, "timestamp": "2025-09-30 22:10:15.315259", "step": 304, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:15.371443", "step": 304, "epoch": 1 }, { "type": "loss", "content": 0.02207523211836815, "timestamp": "2025-09-30 22:10:15.374857", "step": 305, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:15.433040", "step": 305, "epoch": 1 }, { "type": "loss", "content": 0.026255745440721512, "timestamp": "2025-09-30 22:10:15.435725", "step": 306, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:15.497514", "step": 306, "epoch": 1 }, { "type": "loss", "content": 0.02717163972556591, "timestamp": "2025-09-30 22:10:15.501327", "step": 307, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:15.556699", "step": 307, "epoch": 1 }, { "type": "loss", "content": 0.018035126850008965, "timestamp": "2025-09-30 22:10:15.564100", "step": 308, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:15.619353", "step": 308, "epoch": 1 }, { "type": "loss", "content": 0.009531312622129917, "timestamp": "2025-09-30 22:10:15.622592", "step": 309, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:15.680727", "step": 309, "epoch": 1 }, { "type": "loss", "content": 0.030687615275382996, "timestamp": "2025-09-30 22:10:15.692879", "step": 310, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:15.752017", "step": 310, "epoch": 1 }, { "type": "loss", "content": 0.029424799606204033, "timestamp": "2025-09-30 22:10:15.754868", "step": 311, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:15.819561", "step": 311, "epoch": 1 }, { "type": "loss", "content": 0.019238030537962914, "timestamp": "2025-09-30 22:10:15.826004", "step": 312, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:15.886926", "step": 312, "epoch": 1 }, { "type": "loss", "content": 0.024589749053120613, "timestamp": "2025-09-30 22:10:15.894310", "step": 313, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:15.957840", "step": 313, "epoch": 1 }, { "type": "loss", "content": 0.017089612782001495, "timestamp": "2025-09-30 22:10:15.961295", "step": 314, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:16.027620", "step": 314, "epoch": 1 }, { "type": "loss", "content": 0.029858587309718132, "timestamp": "2025-09-30 22:10:16.030633", "step": 315, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:16.090178", "step": 315, "epoch": 1 }, { "type": "loss", "content": 0.008975454606115818, "timestamp": "2025-09-30 22:10:16.099786", "step": 316, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:16.157823", "step": 316, "epoch": 1 }, { "type": "loss", "content": 0.01512650866061449, "timestamp": "2025-09-30 22:10:16.160346", "step": 317, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:16.215635", "step": 317, "epoch": 1 }, { "type": "loss", "content": 0.027577145025134087, "timestamp": "2025-09-30 22:10:16.219071", "step": 318, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:16.274323", "step": 318, "epoch": 1 }, { "type": "loss", "content": 0.017553279176354408, "timestamp": "2025-09-30 22:10:16.277723", "step": 319, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:16.336932", "step": 319, "epoch": 1 }, { "type": "loss", "content": 0.019269872456789017, "timestamp": "2025-09-30 22:10:16.343333", "step": 320, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:16.397626", "step": 320, "epoch": 1 }, { "type": "loss", "content": 0.019443640485405922, "timestamp": "2025-09-30 22:10:16.405909", "step": 321, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:16.465354", "step": 321, "epoch": 1 }, { "type": "loss", "content": 0.03317169472575188, "timestamp": "2025-09-30 22:10:16.467887", "step": 322, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:16.528175", "step": 322, "epoch": 1 }, { "type": "loss", "content": 0.029915081337094307, "timestamp": "2025-09-30 22:10:16.532803", "step": 323, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:16.587604", "step": 323, "epoch": 1 }, { "type": "loss", "content": 0.02484137937426567, "timestamp": "2025-09-30 22:10:16.596854", "step": 324, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:16.662203", "step": 324, "epoch": 1 }, { "type": "loss", "content": 0.022063011303544044, "timestamp": "2025-09-30 22:10:16.665242", "step": 325, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:16.720220", "step": 325, "epoch": 1 }, { "type": "loss", "content": 0.0251996461302042, "timestamp": "2025-09-30 22:10:16.723926", "step": 326, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:16.778169", "step": 326, "epoch": 1 }, { "type": "loss", "content": 0.027159083634614944, "timestamp": "2025-09-30 22:10:16.781078", "step": 327, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:10:16.837381", "step": 327, "epoch": 1 }, { "type": "loss", "content": 0.02359570376574993, "timestamp": "2025-09-30 22:10:16.843453", "step": 328, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:16.897805", "step": 328, "epoch": 1 }, { "type": "loss", "content": 0.028713062405586243, "timestamp": "2025-09-30 22:10:16.900001", "step": 329, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:16.958749", "step": 329, "epoch": 1 }, { "type": "loss", "content": 0.028622686862945557, "timestamp": "2025-09-30 22:10:16.962010", "step": 330, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:17.019372", "step": 330, "epoch": 1 }, { "type": "loss", "content": 0.01968633010983467, "timestamp": "2025-09-30 22:10:17.022951", "step": 331, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:17.077163", "step": 331, "epoch": 1 }, { "type": "loss", "content": 0.01846577599644661, "timestamp": "2025-09-30 22:10:17.082746", "step": 332, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:17.136997", "step": 332, "epoch": 1 }, { "type": "loss", "content": 0.026781436055898666, "timestamp": "2025-09-30 22:10:17.139637", "step": 333, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:17.194116", "step": 333, "epoch": 1 }, { "type": "loss", "content": 0.021621640771627426, "timestamp": "2025-09-30 22:10:17.196713", "step": 334, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:17.249465", "step": 334, "epoch": 1 }, { "type": "loss", "content": 0.0181210245937109, "timestamp": "2025-09-30 22:10:17.255248", "step": 335, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:10:17.318124", "step": 335, "epoch": 1 }, { "type": "loss", "content": 0.019643476232886314, "timestamp": "2025-09-30 22:10:17.324116", "step": 336, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:17.377700", "step": 336, "epoch": 1 }, { "type": "loss", "content": 0.023817529901862144, "timestamp": "2025-09-30 22:10:17.380338", "step": 337, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:17.434506", "step": 337, "epoch": 1 }, { "type": "loss", "content": 0.017907124012708664, "timestamp": "2025-09-30 22:10:17.437861", "step": 338, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:17.490611", "step": 338, "epoch": 1 }, { "type": "loss", "content": 0.025304758921265602, "timestamp": "2025-09-30 22:10:17.494601", "step": 339, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:17.549116", "step": 339, "epoch": 1 }, { "type": "loss", "content": 0.01915588602423668, "timestamp": "2025-09-30 22:10:17.555595", "step": 340, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:17.609286", "step": 340, "epoch": 1 }, { "type": "loss", "content": 0.019800949841737747, "timestamp": "2025-09-30 22:10:17.616037", "step": 341, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:17.671375", "step": 341, "epoch": 1 }, { "type": "loss", "content": 0.027113808318972588, "timestamp": "2025-09-30 22:10:17.677090", "step": 342, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:10:18.928133", "step": 342, "epoch": 1 }, { "type": "pplx", "content": 33118764.612160176, "timestamp": "2025-09-30 22:10:18.930856", "step": 342, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:18.982725", "step": 342, "epoch": 1 }, { "type": "loss", "content": 0.021984072402119637, "timestamp": "2025-09-30 22:10:18.987387", "step": 343, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:19.044229", "step": 343, "epoch": 1 }, { "type": "loss", "content": 0.03714148327708244, "timestamp": "2025-09-30 22:10:19.050010", "step": 344, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:19.103528", "step": 344, "epoch": 1 }, { "type": "loss", "content": 0.02487654611468315, "timestamp": "2025-09-30 22:10:19.106751", "step": 345, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:19.161194", "step": 345, "epoch": 1 }, { "type": "loss", "content": 0.027115579694509506, "timestamp": "2025-09-30 22:10:19.164115", "step": 346, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:19.218126", "step": 346, "epoch": 1 }, { "type": "loss", "content": 0.021236712113022804, "timestamp": "2025-09-30 22:10:19.220655", "step": 347, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:19.273985", "step": 347, "epoch": 1 }, { "type": "loss", "content": 0.02278432808816433, "timestamp": "2025-09-30 22:10:19.279822", "step": 348, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:19.332898", "step": 348, "epoch": 1 }, { "type": "loss", "content": 0.018770869821310043, "timestamp": "2025-09-30 22:10:19.335724", "step": 349, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:19.390109", "step": 349, "epoch": 1 }, { "type": "loss", "content": 0.02387206256389618, "timestamp": "2025-09-30 22:10:19.393380", "step": 350, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:19.449148", "step": 350, "epoch": 1 }, { "type": "loss", "content": 0.029281629249453545, "timestamp": "2025-09-30 22:10:19.452340", "step": 351, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:19.505888", "step": 351, "epoch": 1 }, { "type": "loss", "content": 0.023951295763254166, "timestamp": "2025-09-30 22:10:19.512866", "step": 352, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:19.572066", "step": 352, "epoch": 1 }, { "type": "loss", "content": 0.033801205456256866, "timestamp": "2025-09-30 22:10:19.574004", "step": 353, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:19.628517", "step": 353, "epoch": 1 }, { "type": "loss", "content": 0.02057676389813423, "timestamp": "2025-09-30 22:10:19.630640", "step": 354, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:19.684004", "step": 354, "epoch": 1 }, { "type": "loss", "content": 0.024962907657027245, "timestamp": "2025-09-30 22:10:19.686449", "step": 355, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:19.739625", "step": 355, "epoch": 1 }, { "type": "loss", "content": 0.015609494410455227, "timestamp": "2025-09-30 22:10:19.745689", "step": 356, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:19.801726", "step": 356, "epoch": 1 }, { "type": "loss", "content": 0.02138134464621544, "timestamp": "2025-09-30 22:10:19.804873", "step": 357, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:19.857927", "step": 357, "epoch": 1 }, { "type": "loss", "content": 0.01888282783329487, "timestamp": "2025-09-30 22:10:19.860392", "step": 358, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:19.913884", "step": 358, "epoch": 1 }, { "type": "loss", "content": 0.01716557890176773, "timestamp": "2025-09-30 22:10:19.916491", "step": 359, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:19.970266", "step": 359, "epoch": 1 }, { "type": "loss", "content": 0.01593884639441967, "timestamp": "2025-09-30 22:10:19.976418", "step": 360, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:20.035102", "step": 360, "epoch": 1 }, { "type": "loss", "content": 0.024571670219302177, "timestamp": "2025-09-30 22:10:20.038125", "step": 361, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:20.092636", "step": 361, "epoch": 1 }, { "type": "loss", "content": 0.02865268848836422, "timestamp": "2025-09-30 22:10:20.094879", "step": 362, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:20.149660", "step": 362, "epoch": 1 }, { "type": "loss", "content": 0.020879996940493584, "timestamp": "2025-09-30 22:10:20.152442", "step": 363, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:20.207160", "step": 363, "epoch": 1 }, { "type": "loss", "content": 0.015153266489505768, "timestamp": "2025-09-30 22:10:20.213071", "step": 364, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:20.265866", "step": 364, "epoch": 1 }, { "type": "loss", "content": 0.026744915172457695, "timestamp": "2025-09-30 22:10:20.269002", "step": 365, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:20.323260", "step": 365, "epoch": 1 }, { "type": "loss", "content": 0.014960569329559803, "timestamp": "2025-09-30 22:10:20.326362", "step": 366, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:10:20.380718", "step": 366, "epoch": 1 }, { "type": "loss", "content": 0.019018033519387245, "timestamp": "2025-09-30 22:10:20.383960", "step": 367, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:20.438786", "step": 367, "epoch": 1 }, { "type": "loss", "content": 0.027187785133719444, "timestamp": "2025-09-30 22:10:20.444869", "step": 368, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:20.499318", "step": 368, "epoch": 1 }, { "type": "loss", "content": 0.02450934611260891, "timestamp": "2025-09-30 22:10:20.501389", "step": 369, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:20.561671", "step": 369, "epoch": 1 }, { "type": "loss", "content": 0.02450348250567913, "timestamp": "2025-09-30 22:10:20.564282", "step": 370, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:20.618016", "step": 370, "epoch": 1 }, { "type": "loss", "content": 0.013314715586602688, "timestamp": "2025-09-30 22:10:20.620695", "step": 371, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:20.681024", "step": 371, "epoch": 1 }, { "type": "loss", "content": 0.03389272093772888, "timestamp": "2025-09-30 22:10:20.686886", "step": 372, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:20.743579", "step": 372, "epoch": 1 }, { "type": "loss", "content": 0.023728221654891968, "timestamp": "2025-09-30 22:10:20.746212", "step": 373, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:20.805459", "step": 373, "epoch": 1 }, { "type": "loss", "content": 0.02945546805858612, "timestamp": "2025-09-30 22:10:20.808132", "step": 374, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:20.862539", "step": 374, "epoch": 1 }, { "type": "loss", "content": 0.016989445313811302, "timestamp": "2025-09-30 22:10:20.865660", "step": 375, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:20.919975", "step": 375, "epoch": 1 }, { "type": "loss", "content": 0.016743594780564308, "timestamp": "2025-09-30 22:10:20.925919", "step": 376, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:20.983291", "step": 376, "epoch": 1 }, { "type": "loss", "content": 0.012260585092008114, "timestamp": "2025-09-30 22:10:20.985700", "step": 377, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:21.041935", "step": 377, "epoch": 1 }, { "type": "loss", "content": 0.018505526706576347, "timestamp": "2025-09-30 22:10:21.044221", "step": 378, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:21.099361", "step": 378, "epoch": 1 }, { "type": "loss", "content": 0.02107120119035244, "timestamp": "2025-09-30 22:10:21.101514", "step": 379, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:21.159867", "step": 379, "epoch": 1 }, { "type": "loss", "content": 0.02227369323372841, "timestamp": "2025-09-30 22:10:21.165467", "step": 380, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:21.218242", "step": 380, "epoch": 1 }, { "type": "loss", "content": 0.025716906413435936, "timestamp": "2025-09-30 22:10:21.220708", "step": 381, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:21.278757", "step": 381, "epoch": 1 }, { "type": "loss", "content": 0.019525570794939995, "timestamp": "2025-09-30 22:10:21.280994", "step": 382, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:21.342716", "step": 382, "epoch": 1 }, { "type": "loss", "content": 0.012794404290616512, "timestamp": "2025-09-30 22:10:21.345127", "step": 383, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:21.401362", "step": 383, "epoch": 1 }, { "type": "loss", "content": 0.02354384958744049, "timestamp": "2025-09-30 22:10:21.406613", "step": 384, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:21.464824", "step": 384, "epoch": 1 }, { "type": "loss", "content": 0.010914976708590984, "timestamp": "2025-09-30 22:10:21.468655", "step": 385, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:21.522272", "step": 385, "epoch": 1 }, { "type": "loss", "content": 0.017349114641547203, "timestamp": "2025-09-30 22:10:21.524393", "step": 386, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:21.583430", "step": 386, "epoch": 1 }, { "type": "loss", "content": 0.018263814970850945, "timestamp": "2025-09-30 22:10:21.586006", "step": 387, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:21.649324", "step": 387, "epoch": 1 }, { "type": "loss", "content": 0.019998768344521523, "timestamp": "2025-09-30 22:10:21.654921", "step": 388, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:21.715235", "step": 388, "epoch": 1 }, { "type": "loss", "content": 0.016233079135417938, "timestamp": "2025-09-30 22:10:21.717707", "step": 389, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:21.775916", "step": 389, "epoch": 1 }, { "type": "loss", "content": 0.02938160113990307, "timestamp": "2025-09-30 22:10:21.778133", "step": 390, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:21.833207", "step": 390, "epoch": 1 }, { "type": "loss", "content": 0.03876896947622299, "timestamp": "2025-09-30 22:10:21.835192", "step": 391, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:21.891248", "step": 391, "epoch": 1 }, { "type": "loss", "content": 0.013861365616321564, "timestamp": "2025-09-30 22:10:21.897422", "step": 392, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:21.953983", "step": 392, "epoch": 1 }, { "type": "loss", "content": 0.01384772453457117, "timestamp": "2025-09-30 22:10:21.956099", "step": 393, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:10:22.011307", "step": 393, "epoch": 1 }, { "type": "loss", "content": 0.020624876022338867, "timestamp": "2025-09-30 22:10:22.013443", "step": 394, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:22.067347", "step": 394, "epoch": 1 }, { "type": "loss", "content": 0.00986799132078886, "timestamp": "2025-09-30 22:10:22.069504", "step": 395, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:22.125853", "step": 395, "epoch": 1 }, { "type": "loss", "content": 0.023464640602469444, "timestamp": "2025-09-30 22:10:22.131474", "step": 396, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:22.190813", "step": 396, "epoch": 1 }, { "type": "loss", "content": 0.009329124353826046, "timestamp": "2025-09-30 22:10:22.193032", "step": 397, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:22.251437", "step": 397, "epoch": 1 }, { "type": "loss", "content": 0.013422129675745964, "timestamp": "2025-09-30 22:10:22.253601", "step": 398, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:22.313231", "step": 398, "epoch": 1 }, { "type": "loss", "content": 0.023139040917158127, "timestamp": "2025-09-30 22:10:22.315268", "step": 399, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:10:23.660296", "step": 399, "epoch": 1 }, { "type": "pplx", "content": 40589601.62274881, "timestamp": "2025-09-30 22:10:23.662816", "step": 399, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:23.715616", "step": 399, "epoch": 1 }, { "type": "loss", "content": 0.01724778302013874, "timestamp": "2025-09-30 22:10:23.721952", "step": 400, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:23.777851", "step": 400, "epoch": 1 }, { "type": "loss", "content": 0.008374908939003944, "timestamp": "2025-09-30 22:10:23.780509", "step": 401, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:23.835680", "step": 401, "epoch": 1 }, { "type": "loss", "content": 0.01957196369767189, "timestamp": "2025-09-30 22:10:23.838371", "step": 402, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:23.894866", "step": 402, "epoch": 1 }, { "type": "loss", "content": 0.018934715539216995, "timestamp": "2025-09-30 22:10:23.897082", "step": 403, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:23.953152", "step": 403, "epoch": 1 }, { "type": "loss", "content": 0.03373418375849724, "timestamp": "2025-09-30 22:10:23.962739", "step": 404, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:24.018580", "step": 404, "epoch": 1 }, { "type": "loss", "content": 0.027509469538927078, "timestamp": "2025-09-30 22:10:24.022584", "step": 405, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:24.078777", "step": 405, "epoch": 1 }, { "type": "loss", "content": 0.03421385958790779, "timestamp": "2025-09-30 22:10:24.081776", "step": 406, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:24.138442", "step": 406, "epoch": 1 }, { "type": "loss", "content": 0.011767718009650707, "timestamp": "2025-09-30 22:10:24.140810", "step": 407, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:24.197396", "step": 407, "epoch": 1 }, { "type": "loss", "content": 0.009523920714855194, "timestamp": "2025-09-30 22:10:24.203005", "step": 408, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:24.258085", "step": 408, "epoch": 1 }, { "type": "loss", "content": 0.023020844906568527, "timestamp": "2025-09-30 22:10:24.261501", "step": 409, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:24.321430", "step": 409, "epoch": 1 }, { "type": "loss", "content": 0.020535219460725784, "timestamp": "2025-09-30 22:10:24.324678", "step": 410, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:24.384770", "step": 410, "epoch": 1 }, { "type": "loss", "content": 0.01701906882226467, "timestamp": "2025-09-30 22:10:24.391280", "step": 411, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:24.449399", "step": 411, "epoch": 1 }, { "type": "loss", "content": 0.03201786056160927, "timestamp": "2025-09-30 22:10:24.457737", "step": 412, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:24.512109", "step": 412, "epoch": 1 }, { "type": "loss", "content": 0.011379254050552845, "timestamp": "2025-09-30 22:10:24.515992", "step": 413, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:24.584178", "step": 413, "epoch": 1 }, { "type": "loss", "content": 0.03204244002699852, "timestamp": "2025-09-30 22:10:24.589115", "step": 414, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:24.645428", "step": 414, "epoch": 1 }, { "type": "loss", "content": 0.035000745207071304, "timestamp": "2025-09-30 22:10:24.648528", "step": 415, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:24.708676", "step": 415, "epoch": 1 }, { "type": "loss", "content": 0.011228829622268677, "timestamp": "2025-09-30 22:10:24.715697", "step": 416, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:24.774441", "step": 416, "epoch": 1 }, { "type": "loss", "content": 0.015902062878012657, "timestamp": "2025-09-30 22:10:24.778489", "step": 417, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:24.837328", "step": 417, "epoch": 1 }, { "type": "loss", "content": 0.029384631663560867, "timestamp": "2025-09-30 22:10:24.839269", "step": 418, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:24.893599", "step": 418, "epoch": 1 }, { "type": "loss", "content": 0.022646890953183174, "timestamp": "2025-09-30 22:10:24.897402", "step": 419, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:24.953933", "step": 419, "epoch": 1 }, { "type": "loss", "content": 0.015414858236908913, "timestamp": "2025-09-30 22:10:24.960738", "step": 420, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:25.018909", "step": 420, "epoch": 1 }, { "type": "loss", "content": 0.015387741848826408, "timestamp": "2025-09-30 22:10:25.021339", "step": 421, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:25.076031", "step": 421, "epoch": 1 }, { "type": "loss", "content": 0.016175920143723488, "timestamp": "2025-09-30 22:10:25.078361", "step": 422, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:25.140468", "step": 422, "epoch": 1 }, { "type": "loss", "content": 0.020924212411046028, "timestamp": "2025-09-30 22:10:25.143111", "step": 423, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:25.198775", "step": 423, "epoch": 1 }, { "type": "loss", "content": 0.015957873314619064, "timestamp": "2025-09-30 22:10:25.204587", "step": 424, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:25.257981", "step": 424, "epoch": 1 }, { "type": "loss", "content": 0.018746186047792435, "timestamp": "2025-09-30 22:10:25.260168", "step": 425, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:25.317623", "step": 425, "epoch": 1 }, { "type": "loss", "content": 0.028188351541757584, "timestamp": "2025-09-30 22:10:25.319833", "step": 426, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:25.374649", "step": 426, "epoch": 1 }, { "type": "loss", "content": 0.015842631459236145, "timestamp": "2025-09-30 22:10:25.377221", "step": 427, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:25.436214", "step": 427, "epoch": 1 }, { "type": "loss", "content": 0.016134170815348625, "timestamp": "2025-09-30 22:10:25.442167", "step": 428, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:25.498915", "step": 428, "epoch": 1 }, { "type": "loss", "content": 0.040546927601099014, "timestamp": "2025-09-30 22:10:25.500908", "step": 429, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:25.561078", "step": 429, "epoch": 1 }, { "type": "loss", "content": 0.016925180330872536, "timestamp": "2025-09-30 22:10:25.565189", "step": 430, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:25.620787", "step": 430, "epoch": 1 }, { "type": "loss", "content": 0.01041901670396328, "timestamp": "2025-09-30 22:10:25.627804", "step": 431, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:25.688574", "step": 431, "epoch": 1 }, { "type": "loss", "content": 0.01679927296936512, "timestamp": "2025-09-30 22:10:25.694911", "step": 432, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:25.754989", "step": 432, "epoch": 1 }, { "type": "loss", "content": 0.01948891021311283, "timestamp": "2025-09-30 22:10:25.758013", "step": 433, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:10:25.813575", "step": 433, "epoch": 1 }, { "type": "loss", "content": 0.026692839339375496, "timestamp": "2025-09-30 22:10:25.820015", "step": 434, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:25.876179", "step": 434, "epoch": 1 }, { "type": "loss", "content": 0.021725250408053398, "timestamp": "2025-09-30 22:10:25.880139", "step": 435, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:25.934999", "step": 435, "epoch": 1 }, { "type": "loss", "content": 0.016403162851929665, "timestamp": "2025-09-30 22:10:25.953512", "step": 436, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:26.011910", "step": 436, "epoch": 1 }, { "type": "loss", "content": 0.02581772208213806, "timestamp": "2025-09-30 22:10:26.019119", "step": 437, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:26.077043", "step": 437, "epoch": 1 }, { "type": "loss", "content": 0.023395583033561707, "timestamp": "2025-09-30 22:10:26.081776", "step": 438, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:26.138996", "step": 438, "epoch": 1 }, { "type": "loss", "content": 0.03953873738646507, "timestamp": "2025-09-30 22:10:26.142696", "step": 439, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:26.198275", "step": 439, "epoch": 1 }, { "type": "loss", "content": 0.018297594040632248, "timestamp": "2025-09-30 22:10:26.206228", "step": 440, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:26.262605", "step": 440, "epoch": 1 }, { "type": "loss", "content": 0.030052199959754944, "timestamp": "2025-09-30 22:10:26.266573", "step": 441, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:26.326102", "step": 441, "epoch": 1 }, { "type": "loss", "content": 0.029010707512497902, "timestamp": "2025-09-30 22:10:26.329939", "step": 442, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:26.392124", "step": 442, "epoch": 1 }, { "type": "loss", "content": 0.013197534717619419, "timestamp": "2025-09-30 22:10:26.395865", "step": 443, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:26.455099", "step": 443, "epoch": 1 }, { "type": "loss", "content": 0.03683071583509445, "timestamp": "2025-09-30 22:10:26.462646", "step": 444, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:26.518023", "step": 444, "epoch": 1 }, { "type": "loss", "content": 0.01791354827582836, "timestamp": "2025-09-30 22:10:26.522558", "step": 445, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:26.580204", "step": 445, "epoch": 1 }, { "type": "loss", "content": 0.01698167435824871, "timestamp": "2025-09-30 22:10:26.584622", "step": 446, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:26.644908", "step": 446, "epoch": 1 }, { "type": "loss", "content": 0.012840881943702698, "timestamp": "2025-09-30 22:10:26.649589", "step": 447, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:26.706363", "step": 447, "epoch": 1 }, { "type": "loss", "content": 0.021327754482626915, "timestamp": "2025-09-30 22:10:26.712053", "step": 448, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:26.770017", "step": 448, "epoch": 1 }, { "type": "loss", "content": 0.012066961266100407, "timestamp": "2025-09-30 22:10:26.775318", "step": 449, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:26.837781", "step": 449, "epoch": 1 }, { "type": "loss", "content": 0.028330016881227493, "timestamp": "2025-09-30 22:10:26.846404", "step": 450, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:26.909229", "step": 450, "epoch": 1 }, { "type": "loss", "content": 0.023282703012228012, "timestamp": "2025-09-30 22:10:26.912219", "step": 451, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:26.966278", "step": 451, "epoch": 1 }, { "type": "loss", "content": 0.010071339085698128, "timestamp": "2025-09-30 22:10:26.976341", "step": 452, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:27.043117", "step": 452, "epoch": 1 }, { "type": "loss", "content": 0.01836954988539219, "timestamp": "2025-09-30 22:10:27.046716", "step": 453, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:10:27.101355", "step": 453, "epoch": 1 }, { "type": "loss", "content": 0.025107625871896744, "timestamp": "2025-09-30 22:10:27.103861", "step": 454, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:27.166366", "step": 454, "epoch": 1 }, { "type": "loss", "content": 0.02213280089199543, "timestamp": "2025-09-30 22:10:27.172278", "step": 455, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:27.230762", "step": 455, "epoch": 1 }, { "type": "loss", "content": 0.012315492145717144, "timestamp": "2025-09-30 22:10:27.239089", "step": 456, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:10:28.975277", "step": 456, "epoch": 1 }, { "type": "pplx", "content": 43492757.85152588, "timestamp": "2025-09-30 22:10:28.977567", "step": 456, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:29.039128", "step": 456, "epoch": 1 }, { "type": "loss", "content": 0.02141198329627514, "timestamp": "2025-09-30 22:10:29.048866", "step": 457, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:29.110780", "step": 457, "epoch": 1 }, { "type": "loss", "content": 0.0336587019264698, "timestamp": "2025-09-30 22:10:29.115494", "step": 458, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:29.173645", "step": 458, "epoch": 1 }, { "type": "loss", "content": 0.014187236316502094, "timestamp": "2025-09-30 22:10:29.180707", "step": 459, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:29.240772", "step": 459, "epoch": 1 }, { "type": "loss", "content": 0.026401251554489136, "timestamp": "2025-09-30 22:10:29.247468", "step": 460, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:29.310695", "step": 460, "epoch": 1 }, { "type": "loss", "content": 0.020034367218613625, "timestamp": "2025-09-30 22:10:29.317981", "step": 461, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:29.376268", "step": 461, "epoch": 1 }, { "type": "loss", "content": 0.011318235658109188, "timestamp": "2025-09-30 22:10:29.382341", "step": 462, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:29.443870", "step": 462, "epoch": 1 }, { "type": "loss", "content": 0.009131813421845436, "timestamp": "2025-09-30 22:10:29.452675", "step": 463, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:29.513953", "step": 463, "epoch": 1 }, { "type": "loss", "content": 0.020363813266158104, "timestamp": "2025-09-30 22:10:29.520672", "step": 464, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:29.585071", "step": 464, "epoch": 1 }, { "type": "loss", "content": 0.019311266019940376, "timestamp": "2025-09-30 22:10:29.587763", "step": 465, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:29.642703", "step": 465, "epoch": 1 }, { "type": "loss", "content": 0.01495366357266903, "timestamp": "2025-09-30 22:10:29.645362", "step": 466, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:29.709787", "step": 466, "epoch": 1 }, { "type": "loss", "content": 0.023592934012413025, "timestamp": "2025-09-30 22:10:29.715529", "step": 467, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:29.776645", "step": 467, "epoch": 1 }, { "type": "loss", "content": 0.027076885104179382, "timestamp": "2025-09-30 22:10:29.785682", "step": 468, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:29.843777", "step": 468, "epoch": 1 }, { "type": "loss", "content": 0.02357049658894539, "timestamp": "2025-09-30 22:10:29.849824", "step": 469, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:29.915063", "step": 469, "epoch": 1 }, { "type": "loss", "content": 0.02975117228925228, "timestamp": "2025-09-30 22:10:29.917443", "step": 470, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:29.974919", "step": 470, "epoch": 1 }, { "type": "loss", "content": 0.0073095522820949554, "timestamp": "2025-09-30 22:10:29.981202", "step": 471, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:30.042258", "step": 471, "epoch": 1 }, { "type": "loss", "content": 0.01751861535012722, "timestamp": "2025-09-30 22:10:30.048005", "step": 472, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:30.104524", "step": 472, "epoch": 1 }, { "type": "loss", "content": 0.017867306247353554, "timestamp": "2025-09-30 22:10:30.107760", "step": 473, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:30.163190", "step": 473, "epoch": 1 }, { "type": "loss", "content": 0.018450839444994926, "timestamp": "2025-09-30 22:10:30.165609", "step": 474, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:30.243448", "step": 474, "epoch": 1 }, { "type": "loss", "content": 0.011997430585324764, "timestamp": "2025-09-30 22:10:30.245751", "step": 475, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:30.303755", "step": 475, "epoch": 1 }, { "type": "loss", "content": 0.012514320202171803, "timestamp": "2025-09-30 22:10:30.310204", "step": 476, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:30.369301", "step": 476, "epoch": 1 }, { "type": "loss", "content": 0.017777670174837112, "timestamp": "2025-09-30 22:10:30.371917", "step": 477, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:30.429023", "step": 477, "epoch": 1 }, { "type": "loss", "content": 0.01758970133960247, "timestamp": "2025-09-30 22:10:30.435678", "step": 478, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:30.492913", "step": 478, "epoch": 1 }, { "type": "loss", "content": 0.01861104555428028, "timestamp": "2025-09-30 22:10:30.495572", "step": 479, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:30.558831", "step": 479, "epoch": 1 }, { "type": "loss", "content": 0.012014762498438358, "timestamp": "2025-09-30 22:10:30.565157", "step": 480, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:30.623237", "step": 480, "epoch": 1 }, { "type": "loss", "content": 0.022567814216017723, "timestamp": "2025-09-30 22:10:30.632032", "step": 481, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:30.696343", "step": 481, "epoch": 1 }, { "type": "loss", "content": 0.02221139892935753, "timestamp": "2025-09-30 22:10:30.700014", "step": 482, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:30.755962", "step": 482, "epoch": 1 }, { "type": "loss", "content": 0.014111381955444813, "timestamp": "2025-09-30 22:10:30.767462", "step": 483, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:30.829136", "step": 483, "epoch": 1 }, { "type": "loss", "content": 0.01386257540434599, "timestamp": "2025-09-30 22:10:30.844391", "step": 484, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:30.905039", "step": 484, "epoch": 1 }, { "type": "loss", "content": 0.019835516810417175, "timestamp": "2025-09-30 22:10:30.914402", "step": 485, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:30.979295", "step": 485, "epoch": 1 }, { "type": "loss", "content": 0.015690794214606285, "timestamp": "2025-09-30 22:10:30.984650", "step": 486, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:31.042843", "step": 486, "epoch": 1 }, { "type": "loss", "content": 0.028532350435853004, "timestamp": "2025-09-30 22:10:31.045845", "step": 487, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:31.101781", "step": 487, "epoch": 1 }, { "type": "loss", "content": 0.024972129613161087, "timestamp": "2025-09-30 22:10:31.112277", "step": 488, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:31.171834", "step": 488, "epoch": 1 }, { "type": "loss", "content": 0.018831370398402214, "timestamp": "2025-09-30 22:10:31.174381", "step": 489, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:31.230777", "step": 489, "epoch": 1 }, { "type": "loss", "content": 0.014749753288924694, "timestamp": "2025-09-30 22:10:31.233694", "step": 490, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:31.289017", "step": 490, "epoch": 1 }, { "type": "loss", "content": 0.013099046424031258, "timestamp": "2025-09-30 22:10:31.298818", "step": 491, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:31.370536", "step": 491, "epoch": 1 }, { "type": "loss", "content": 0.009160849265754223, "timestamp": "2025-09-30 22:10:31.377012", "step": 492, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:31.442153", "step": 492, "epoch": 1 }, { "type": "loss", "content": 0.016461042687296867, "timestamp": "2025-09-30 22:10:31.451114", "step": 493, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:31.512076", "step": 493, "epoch": 1 }, { "type": "loss", "content": 0.043794337660074234, "timestamp": "2025-09-30 22:10:31.515396", "step": 494, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:31.573231", "step": 494, "epoch": 1 }, { "type": "loss", "content": 0.022364290431141853, "timestamp": "2025-09-30 22:10:31.582922", "step": 495, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:31.640558", "step": 495, "epoch": 1 }, { "type": "loss", "content": 0.021926935762166977, "timestamp": "2025-09-30 22:10:31.647384", "step": 496, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:31.707818", "step": 496, "epoch": 1 }, { "type": "loss", "content": 0.00889673549681902, "timestamp": "2025-09-30 22:10:31.709679", "step": 497, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:31.796642", "step": 497, "epoch": 1 }, { "type": "loss", "content": 0.01593521051108837, "timestamp": "2025-09-30 22:10:31.799398", "step": 498, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:31.855056", "step": 498, "epoch": 1 }, { "type": "loss", "content": 0.01839728094637394, "timestamp": "2025-09-30 22:10:31.872085", "step": 499, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:31.931769", "step": 499, "epoch": 1 }, { "type": "loss", "content": 0.010415417142212391, "timestamp": "2025-09-30 22:10:31.944103", "step": 500, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 500", "timestamp": "2025-09-30 22:10:32.385970", "step": 500, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:32.457155", "step": 500, "epoch": 1 }, { "type": "loss", "content": 0.009094706736505032, "timestamp": "2025-09-30 22:10:32.460338", "step": 501, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:32.530121", "step": 501, "epoch": 1 }, { "type": "loss", "content": 0.021521523594856262, "timestamp": "2025-09-30 22:10:32.534049", "step": 502, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:32.597198", "step": 502, "epoch": 1 }, { "type": "loss", "content": 0.01767658442258835, "timestamp": "2025-09-30 22:10:32.600463", "step": 503, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:32.662752", "step": 503, "epoch": 1 }, { "type": "loss", "content": 0.01816992275416851, "timestamp": "2025-09-30 22:10:32.670715", "step": 504, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:32.744440", "step": 504, "epoch": 1 }, { "type": "loss", "content": 0.005588170140981674, "timestamp": "2025-09-30 22:10:32.747345", "step": 505, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:32.802870", "step": 505, "epoch": 1 }, { "type": "loss", "content": 0.04537493363022804, "timestamp": "2025-09-30 22:10:32.805853", "step": 506, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:32.886649", "step": 506, "epoch": 1 }, { "type": "loss", "content": 0.028734682127833366, "timestamp": "2025-09-30 22:10:32.891721", "step": 507, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:32.961030", "step": 507, "epoch": 1 }, { "type": "loss", "content": 0.030345702543854713, "timestamp": "2025-09-30 22:10:32.973246", "step": 508, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:33.039773", "step": 508, "epoch": 1 }, { "type": "loss", "content": 0.008057617582380772, "timestamp": "2025-09-30 22:10:33.042933", "step": 509, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:33.100278", "step": 509, "epoch": 1 }, { "type": "loss", "content": 0.013696548528969288, "timestamp": "2025-09-30 22:10:33.102771", "step": 510, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:33.162158", "step": 510, "epoch": 1 }, { "type": "loss", "content": 0.024826964363455772, "timestamp": "2025-09-30 22:10:33.166225", "step": 511, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:33.224101", "step": 511, "epoch": 1 }, { "type": "loss", "content": 0.00507523724809289, "timestamp": "2025-09-30 22:10:33.231387", "step": 512, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:33.287069", "step": 512, "epoch": 1 }, { "type": "loss", "content": 0.019475262612104416, "timestamp": "2025-09-30 22:10:33.292055", "step": 513, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:10:34.768428", "step": 513, "epoch": 1 }, { "type": "pplx", "content": 46595227.32665869, "timestamp": "2025-09-30 22:10:34.775585", "step": 513, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:34.833990", "step": 513, "epoch": 1 }, { "type": "loss", "content": 0.007841977290809155, "timestamp": "2025-09-30 22:10:34.845351", "step": 514, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:34.908910", "step": 514, "epoch": 1 }, { "type": "loss", "content": 0.02670980803668499, "timestamp": "2025-09-30 22:10:34.911739", "step": 515, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:34.971584", "step": 515, "epoch": 1 }, { "type": "loss", "content": 0.017184732481837273, "timestamp": "2025-09-30 22:10:34.984326", "step": 516, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:35.047781", "step": 516, "epoch": 1 }, { "type": "loss", "content": 0.0331471748650074, "timestamp": "2025-09-30 22:10:35.055702", "step": 517, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:35.112052", "step": 517, "epoch": 1 }, { "type": "loss", "content": 0.026939330622553825, "timestamp": "2025-09-30 22:10:35.114808", "step": 518, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:35.171714", "step": 518, "epoch": 1 }, { "type": "loss", "content": 0.02031738869845867, "timestamp": "2025-09-30 22:10:35.181994", "step": 519, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:35.247523", "step": 519, "epoch": 1 }, { "type": "loss", "content": 0.009875715710222721, "timestamp": "2025-09-30 22:10:35.260487", "step": 520, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:35.326048", "step": 520, "epoch": 1 }, { "type": "loss", "content": 0.022956620901823044, "timestamp": "2025-09-30 22:10:35.329588", "step": 521, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:35.405846", "step": 521, "epoch": 1 }, { "type": "loss", "content": 0.02863324247300625, "timestamp": "2025-09-30 22:10:35.415191", "step": 522, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:35.480290", "step": 522, "epoch": 1 }, { "type": "loss", "content": 0.010198934003710747, "timestamp": "2025-09-30 22:10:35.488332", "step": 523, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:35.554304", "step": 523, "epoch": 1 }, { "type": "loss", "content": 0.006696059834212065, "timestamp": "2025-09-30 22:10:35.561717", "step": 524, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:35.625076", "step": 524, "epoch": 1 }, { "type": "loss", "content": 0.014501352794468403, "timestamp": "2025-09-30 22:10:35.629654", "step": 525, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:35.686952", "step": 525, "epoch": 1 }, { "type": "loss", "content": 0.024316150695085526, "timestamp": "2025-09-30 22:10:35.689892", "step": 526, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:35.746425", "step": 526, "epoch": 1 }, { "type": "loss", "content": 0.014118111692368984, "timestamp": "2025-09-30 22:10:35.750028", "step": 527, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:35.817322", "step": 527, "epoch": 1 }, { "type": "loss", "content": 0.013456260785460472, "timestamp": "2025-09-30 22:10:35.829823", "step": 528, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:35.887982", "step": 528, "epoch": 1 }, { "type": "loss", "content": 0.010260426439344883, "timestamp": "2025-09-30 22:10:35.895282", "step": 529, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:35.958752", "step": 529, "epoch": 1 }, { "type": "loss", "content": 0.01042400486767292, "timestamp": "2025-09-30 22:10:35.962803", "step": 530, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:36.028568", "step": 530, "epoch": 1 }, { "type": "loss", "content": 0.027442054823040962, "timestamp": "2025-09-30 22:10:36.031254", "step": 531, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:36.092057", "step": 531, "epoch": 1 }, { "type": "loss", "content": 0.022459326311945915, "timestamp": "2025-09-30 22:10:36.104947", "step": 532, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:36.168237", "step": 532, "epoch": 1 }, { "type": "loss", "content": 0.02310291863977909, "timestamp": "2025-09-30 22:10:36.171118", "step": 533, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:36.227533", "step": 533, "epoch": 1 }, { "type": "loss", "content": 0.02492346242070198, "timestamp": "2025-09-30 22:10:36.233507", "step": 534, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:36.294215", "step": 534, "epoch": 1 }, { "type": "loss", "content": 0.01682530902326107, "timestamp": "2025-09-30 22:10:36.299767", "step": 535, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:36.361267", "step": 535, "epoch": 1 }, { "type": "loss", "content": 0.007360723335295916, "timestamp": "2025-09-30 22:10:36.367897", "step": 536, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:36.423199", "step": 536, "epoch": 1 }, { "type": "loss", "content": 0.03802620619535446, "timestamp": "2025-09-30 22:10:36.426126", "step": 537, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:36.486427", "step": 537, "epoch": 1 }, { "type": "loss", "content": 0.014312833547592163, "timestamp": "2025-09-30 22:10:36.491959", "step": 538, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:36.552805", "step": 538, "epoch": 1 }, { "type": "loss", "content": 0.04087325558066368, "timestamp": "2025-09-30 22:10:36.559559", "step": 539, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:36.628565", "step": 539, "epoch": 1 }, { "type": "loss", "content": 0.015957888215780258, "timestamp": "2025-09-30 22:10:36.638319", "step": 540, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:36.697802", "step": 540, "epoch": 1 }, { "type": "loss", "content": 0.02199072763323784, "timestamp": "2025-09-30 22:10:36.707124", "step": 541, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:36.770624", "step": 541, "epoch": 1 }, { "type": "loss", "content": 0.018141640350222588, "timestamp": "2025-09-30 22:10:36.774492", "step": 542, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:36.840242", "step": 542, "epoch": 1 }, { "type": "loss", "content": 0.011592322029173374, "timestamp": "2025-09-30 22:10:36.843581", "step": 543, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:36.909312", "step": 543, "epoch": 1 }, { "type": "loss", "content": 0.023553509265184402, "timestamp": "2025-09-30 22:10:36.920894", "step": 544, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:36.983571", "step": 544, "epoch": 1 }, { "type": "loss", "content": 0.025153178721666336, "timestamp": "2025-09-30 22:10:36.990509", "step": 545, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:37.055055", "step": 545, "epoch": 1 }, { "type": "loss", "content": 0.024465525522828102, "timestamp": "2025-09-30 22:10:37.061105", "step": 546, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:37.122633", "step": 546, "epoch": 1 }, { "type": "loss", "content": 0.036670148372650146, "timestamp": "2025-09-30 22:10:37.129338", "step": 547, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:37.190024", "step": 547, "epoch": 1 }, { "type": "loss", "content": 0.022738128900527954, "timestamp": "2025-09-30 22:10:37.197170", "step": 548, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:37.258684", "step": 548, "epoch": 1 }, { "type": "loss", "content": 0.03440006449818611, "timestamp": "2025-09-30 22:10:37.265618", "step": 549, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:37.323252", "step": 549, "epoch": 1 }, { "type": "loss", "content": 0.030233899131417274, "timestamp": "2025-09-30 22:10:37.326686", "step": 550, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:37.395601", "step": 550, "epoch": 1 }, { "type": "loss", "content": 0.021326003596186638, "timestamp": "2025-09-30 22:10:37.399142", "step": 551, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:37.460428", "step": 551, "epoch": 1 }, { "type": "loss", "content": 0.03364909067749977, "timestamp": "2025-09-30 22:10:37.470963", "step": 552, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:37.532526", "step": 552, "epoch": 1 }, { "type": "loss", "content": 0.027066290378570557, "timestamp": "2025-09-30 22:10:37.538400", "step": 553, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:37.605612", "step": 553, "epoch": 1 }, { "type": "loss", "content": 0.02852269820868969, "timestamp": "2025-09-30 22:10:37.615557", "step": 554, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:37.696930", "step": 554, "epoch": 1 }, { "type": "loss", "content": 0.009644846431910992, "timestamp": "2025-09-30 22:10:37.706231", "step": 555, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:37.782232", "step": 555, "epoch": 1 }, { "type": "loss", "content": 0.024163635447621346, "timestamp": "2025-09-30 22:10:37.791433", "step": 556, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:37.848943", "step": 556, "epoch": 1 }, { "type": "loss", "content": 0.017797913402318954, "timestamp": "2025-09-30 22:10:37.851761", "step": 557, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:37.911890", "step": 557, "epoch": 1 }, { "type": "loss", "content": 0.015675710514187813, "timestamp": "2025-09-30 22:10:37.914217", "step": 558, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:37.983298", "step": 558, "epoch": 1 }, { "type": "loss", "content": 0.017431657761335373, "timestamp": "2025-09-30 22:10:37.987332", "step": 559, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:38.065562", "step": 559, "epoch": 1 }, { "type": "loss", "content": 0.011734207160770893, "timestamp": "2025-09-30 22:10:38.072800", "step": 560, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:38.128930", "step": 560, "epoch": 1 }, { "type": "loss", "content": 0.016993921250104904, "timestamp": "2025-09-30 22:10:38.131406", "step": 561, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:38.190303", "step": 561, "epoch": 1 }, { "type": "loss", "content": 0.019728850573301315, "timestamp": "2025-09-30 22:10:38.193454", "step": 562, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:38.261817", "step": 562, "epoch": 1 }, { "type": "loss", "content": 0.02789020538330078, "timestamp": "2025-09-30 22:10:38.274576", "step": 563, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:38.332263", "step": 563, "epoch": 1 }, { "type": "loss", "content": 0.030427774414420128, "timestamp": "2025-09-30 22:10:38.339142", "step": 564, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:38.396018", "step": 564, "epoch": 1 }, { "type": "loss", "content": 0.02213878743350506, "timestamp": "2025-09-30 22:10:38.409641", "step": 565, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:38.472316", "step": 565, "epoch": 1 }, { "type": "loss", "content": 0.019610082730650902, "timestamp": "2025-09-30 22:10:38.474674", "step": 566, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:38.533261", "step": 566, "epoch": 1 }, { "type": "loss", "content": 0.01741156354546547, "timestamp": "2025-09-30 22:10:38.540039", "step": 567, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:10:38.601991", "step": 567, "epoch": 1 }, { "type": "loss", "content": 0.022262096405029297, "timestamp": "2025-09-30 22:10:38.609836", "step": 568, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:38.667090", "step": 568, "epoch": 1 }, { "type": "loss", "content": 0.024254482239484787, "timestamp": "2025-09-30 22:10:38.671827", "step": 569, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:38.733470", "step": 569, "epoch": 1 }, { "type": "loss", "content": 0.017775481566786766, "timestamp": "2025-09-30 22:10:38.736235", "step": 570, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:10:40.258822", "step": 570, "epoch": 1 }, { "type": "pplx", "content": 47270191.72182987, "timestamp": "2025-09-30 22:10:40.261088", "step": 570, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:40.317993", "step": 570, "epoch": 1 }, { "type": "loss", "content": 0.012138426303863525, "timestamp": "2025-09-30 22:10:40.320234", "step": 571, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:40.376160", "step": 571, "epoch": 1 }, { "type": "loss", "content": 0.011941500008106232, "timestamp": "2025-09-30 22:10:40.382939", "step": 572, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:40.458446", "step": 572, "epoch": 1 }, { "type": "loss", "content": 0.045111771672964096, "timestamp": "2025-09-30 22:10:40.468771", "step": 573, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:40.533176", "step": 573, "epoch": 1 }, { "type": "loss", "content": 0.026581604033708572, "timestamp": "2025-09-30 22:10:40.535322", "step": 574, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:40.592110", "step": 574, "epoch": 1 }, { "type": "loss", "content": 0.017682049423456192, "timestamp": "2025-09-30 22:10:40.594331", "step": 575, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:40.650481", "step": 575, "epoch": 1 }, { "type": "loss", "content": 0.010170449502766132, "timestamp": "2025-09-30 22:10:40.657729", "step": 576, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:40.715792", "step": 576, "epoch": 1 }, { "type": "loss", "content": 0.02111336775124073, "timestamp": "2025-09-30 22:10:40.718679", "step": 577, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:10:40.782680", "step": 577, "epoch": 1 }, { "type": "loss", "content": 0.026711730286478996, "timestamp": "2025-09-30 22:10:40.787449", "step": 578, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:40.854518", "step": 578, "epoch": 1 }, { "type": "loss", "content": 0.0173348356038332, "timestamp": "2025-09-30 22:10:40.857175", "step": 579, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:40.915291", "step": 579, "epoch": 1 }, { "type": "loss", "content": 0.015113255940377712, "timestamp": "2025-09-30 22:10:40.926331", "step": 580, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:40.990132", "step": 580, "epoch": 1 }, { "type": "loss", "content": 0.014600790105760098, "timestamp": "2025-09-30 22:10:40.997747", "step": 581, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:41.073398", "step": 581, "epoch": 1 }, { "type": "loss", "content": 0.007143207360059023, "timestamp": "2025-09-30 22:10:41.076344", "step": 582, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:41.138532", "step": 582, "epoch": 1 }, { "type": "loss", "content": 0.020687399432063103, "timestamp": "2025-09-30 22:10:41.141022", "step": 583, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:41.201736", "step": 583, "epoch": 1 }, { "type": "loss", "content": 0.01433294266462326, "timestamp": "2025-09-30 22:10:41.213194", "step": 584, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:41.273627", "step": 584, "epoch": 1 }, { "type": "loss", "content": 0.016348710283637047, "timestamp": "2025-09-30 22:10:41.280786", "step": 585, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:41.342328", "step": 585, "epoch": 1 }, { "type": "loss", "content": 0.0035052443854510784, "timestamp": "2025-09-30 22:10:41.347937", "step": 586, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:41.414579", "step": 586, "epoch": 1 }, { "type": "loss", "content": 0.002743300748988986, "timestamp": "2025-09-30 22:10:41.416873", "step": 587, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:41.472859", "step": 587, "epoch": 1 }, { "type": "loss", "content": 0.047513365745544434, "timestamp": "2025-09-30 22:10:41.480545", "step": 588, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:41.546176", "step": 588, "epoch": 1 }, { "type": "loss", "content": 0.040245767682790756, "timestamp": "2025-09-30 22:10:41.549512", "step": 589, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:41.609742", "step": 589, "epoch": 1 }, { "type": "loss", "content": 0.02579033002257347, "timestamp": "2025-09-30 22:10:41.612831", "step": 590, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-30 22:10:41.696774", "step": 590, "epoch": 1 }, { "type": "loss", "content": 0.0405263788998127, "timestamp": "2025-09-30 22:10:41.700437", "step": 591, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:41.757185", "step": 591, "epoch": 1 }, { "type": "loss", "content": 0.021822741255164146, "timestamp": "2025-09-30 22:10:41.764096", "step": 592, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:41.820886", "step": 592, "epoch": 1 }, { "type": "loss", "content": 0.027860483154654503, "timestamp": "2025-09-30 22:10:41.828222", "step": 593, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:41.885937", "step": 593, "epoch": 1 }, { "type": "loss", "content": 0.004523556213825941, "timestamp": "2025-09-30 22:10:41.894777", "step": 594, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:41.958205", "step": 594, "epoch": 1 }, { "type": "loss", "content": 0.02394627407193184, "timestamp": "2025-09-30 22:10:41.962740", "step": 595, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:42.021860", "step": 595, "epoch": 1 }, { "type": "loss", "content": 0.038400568068027496, "timestamp": "2025-09-30 22:10:42.029446", "step": 596, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:42.091560", "step": 596, "epoch": 1 }, { "type": "loss", "content": 0.020109396427869797, "timestamp": "2025-09-30 22:10:42.094174", "step": 597, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:42.165711", "step": 597, "epoch": 1 }, { "type": "loss", "content": 0.01921306923031807, "timestamp": "2025-09-30 22:10:42.174038", "step": 598, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:42.237330", "step": 598, "epoch": 1 }, { "type": "loss", "content": 0.035011257976293564, "timestamp": "2025-09-30 22:10:42.240295", "step": 599, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:42.298368", "step": 599, "epoch": 1 }, { "type": "loss", "content": 0.022402983158826828, "timestamp": "2025-09-30 22:10:42.306110", "step": 600, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:42.369255", "step": 600, "epoch": 1 }, { "type": "loss", "content": 0.023800494149327278, "timestamp": "2025-09-30 22:10:42.372114", "step": 601, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:42.428097", "step": 601, "epoch": 1 }, { "type": "loss", "content": 0.01819646917283535, "timestamp": "2025-09-30 22:10:42.431036", "step": 602, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:42.490355", "step": 602, "epoch": 1 }, { "type": "loss", "content": 0.02224593423306942, "timestamp": "2025-09-30 22:10:42.497290", "step": 603, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:42.558386", "step": 603, "epoch": 1 }, { "type": "loss", "content": 0.0203517097979784, "timestamp": "2025-09-30 22:10:42.564695", "step": 604, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:42.627576", "step": 604, "epoch": 1 }, { "type": "loss", "content": 0.019523965194821358, "timestamp": "2025-09-30 22:10:42.631531", "step": 605, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:42.688275", "step": 605, "epoch": 1 }, { "type": "loss", "content": 0.027153076604008675, "timestamp": "2025-09-30 22:10:42.690498", "step": 606, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:42.755120", "step": 606, "epoch": 1 }, { "type": "loss", "content": 0.02785666286945343, "timestamp": "2025-09-30 22:10:42.757460", "step": 607, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:42.815434", "step": 607, "epoch": 1 }, { "type": "loss", "content": 0.021869728341698647, "timestamp": "2025-09-30 22:10:42.821966", "step": 608, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:42.884795", "step": 608, "epoch": 1 }, { "type": "loss", "content": 0.018342459574341774, "timestamp": "2025-09-30 22:10:42.888293", "step": 609, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:42.944945", "step": 609, "epoch": 1 }, { "type": "loss", "content": 0.028766410425305367, "timestamp": "2025-09-30 22:10:42.947387", "step": 610, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:43.005852", "step": 610, "epoch": 1 }, { "type": "loss", "content": 0.018140679225325584, "timestamp": "2025-09-30 22:10:43.009253", "step": 611, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:43.067906", "step": 611, "epoch": 1 }, { "type": "loss", "content": 0.045255787670612335, "timestamp": "2025-09-30 22:10:43.074222", "step": 612, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:43.128319", "step": 612, "epoch": 1 }, { "type": "loss", "content": 0.0265056025236845, "timestamp": "2025-09-30 22:10:43.132320", "step": 613, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:43.198083", "step": 613, "epoch": 1 }, { "type": "loss", "content": 0.01379645336419344, "timestamp": "2025-09-30 22:10:43.203353", "step": 614, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:43.263000", "step": 614, "epoch": 1 }, { "type": "loss", "content": 0.027471302077174187, "timestamp": "2025-09-30 22:10:43.269960", "step": 615, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:43.333110", "step": 615, "epoch": 1 }, { "type": "loss", "content": 0.024327319115400314, "timestamp": "2025-09-30 22:10:43.346979", "step": 616, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:43.409256", "step": 616, "epoch": 1 }, { "type": "loss", "content": 0.03087471053004265, "timestamp": "2025-09-30 22:10:43.412560", "step": 617, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:43.470023", "step": 617, "epoch": 1 }, { "type": "loss", "content": 0.01593298651278019, "timestamp": "2025-09-30 22:10:43.472721", "step": 618, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:43.535159", "step": 618, "epoch": 1 }, { "type": "loss", "content": 0.025733623653650284, "timestamp": "2025-09-30 22:10:43.542048", "step": 619, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:43.600543", "step": 619, "epoch": 1 }, { "type": "loss", "content": 0.014233306981623173, "timestamp": "2025-09-30 22:10:43.610569", "step": 620, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:43.668833", "step": 620, "epoch": 1 }, { "type": "loss", "content": 0.016759194433689117, "timestamp": "2025-09-30 22:10:43.673266", "step": 621, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:43.732562", "step": 621, "epoch": 1 }, { "type": "loss", "content": 0.017879139631986618, "timestamp": "2025-09-30 22:10:43.734884", "step": 622, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:43.790516", "step": 622, "epoch": 1 }, { "type": "loss", "content": 0.015650030225515366, "timestamp": "2025-09-30 22:10:43.793825", "step": 623, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:43.854359", "step": 623, "epoch": 1 }, { "type": "loss", "content": 0.03968540579080582, "timestamp": "2025-09-30 22:10:43.860624", "step": 624, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:43.915520", "step": 624, "epoch": 1 }, { "type": "loss", "content": 0.02014957368373871, "timestamp": "2025-09-30 22:10:43.919115", "step": 625, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:43.983591", "step": 625, "epoch": 1 }, { "type": "loss", "content": 0.013079083524644375, "timestamp": "2025-09-30 22:10:43.986020", "step": 626, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:44.049977", "step": 626, "epoch": 1 }, { "type": "loss", "content": 0.008377542719244957, "timestamp": "2025-09-30 22:10:44.052699", "step": 627, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:10:45.478375", "step": 627, "epoch": 1 }, { "type": "pplx", "content": 36910385.758393474, "timestamp": "2025-09-30 22:10:45.481360", "step": 627, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:45.537084", "step": 627, "epoch": 1 }, { "type": "loss", "content": 0.00950475875288248, "timestamp": "2025-09-30 22:10:45.543599", "step": 628, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:45.607483", "step": 628, "epoch": 1 }, { "type": "loss", "content": 0.04431208595633507, "timestamp": "2025-09-30 22:10:45.615877", "step": 629, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:45.695439", "step": 629, "epoch": 1 }, { "type": "loss", "content": 0.03406307473778725, "timestamp": "2025-09-30 22:10:45.699071", "step": 630, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:45.761839", "step": 630, "epoch": 1 }, { "type": "loss", "content": 0.020670806989073753, "timestamp": "2025-09-30 22:10:45.764255", "step": 631, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:45.826547", "step": 631, "epoch": 1 }, { "type": "loss", "content": 0.01893521286547184, "timestamp": "2025-09-30 22:10:45.833509", "step": 632, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:45.888363", "step": 632, "epoch": 1 }, { "type": "loss", "content": 0.004280415363609791, "timestamp": "2025-09-30 22:10:45.892234", "step": 633, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:45.959896", "step": 633, "epoch": 1 }, { "type": "loss", "content": 0.011709028854966164, "timestamp": "2025-09-30 22:10:45.962642", "step": 634, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:46.031711", "step": 634, "epoch": 1 }, { "type": "loss", "content": 0.027526134625077248, "timestamp": "2025-09-30 22:10:46.034267", "step": 635, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:46.103348", "step": 635, "epoch": 1 }, { "type": "loss", "content": 0.00565726263448596, "timestamp": "2025-09-30 22:10:46.109439", "step": 636, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:46.168041", "step": 636, "epoch": 1 }, { "type": "loss", "content": 0.003934341017156839, "timestamp": "2025-09-30 22:10:46.170929", "step": 637, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:46.255593", "step": 637, "epoch": 1 }, { "type": "loss", "content": 0.04277295991778374, "timestamp": "2025-09-30 22:10:46.258249", "step": 638, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:46.328094", "step": 638, "epoch": 1 }, { "type": "loss", "content": 0.03677859529852867, "timestamp": "2025-09-30 22:10:46.331919", "step": 639, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:46.388579", "step": 639, "epoch": 1 }, { "type": "loss", "content": 0.003129987744614482, "timestamp": "2025-09-30 22:10:46.395424", "step": 640, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:46.456612", "step": 640, "epoch": 1 }, { "type": "loss", "content": 0.006956647150218487, "timestamp": "2025-09-30 22:10:46.458826", "step": 641, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:46.522331", "step": 641, "epoch": 1 }, { "type": "loss", "content": 0.016979986801743507, "timestamp": "2025-09-30 22:10:46.524475", "step": 642, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:46.599315", "step": 642, "epoch": 1 }, { "type": "loss", "content": 0.0076353359036147594, "timestamp": "2025-09-30 22:10:46.602275", "step": 643, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:46.662323", "step": 643, "epoch": 1 }, { "type": "loss", "content": 0.0387401208281517, "timestamp": "2025-09-30 22:10:46.668774", "step": 644, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:46.750641", "step": 644, "epoch": 1 }, { "type": "loss", "content": 0.011696984991431236, "timestamp": "2025-09-30 22:10:46.752924", "step": 645, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:46.809448", "step": 645, "epoch": 1 }, { "type": "loss", "content": 0.03578943759202957, "timestamp": "2025-09-30 22:10:46.812025", "step": 646, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:46.883057", "step": 646, "epoch": 1 }, { "type": "loss", "content": 0.007673552725464106, "timestamp": "2025-09-30 22:10:46.885598", "step": 647, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:46.947206", "step": 647, "epoch": 1 }, { "type": "loss", "content": 0.0077004688791930676, "timestamp": "2025-09-30 22:10:46.954082", "step": 648, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:47.017350", "step": 648, "epoch": 1 }, { "type": "loss", "content": 0.009061289019882679, "timestamp": "2025-09-30 22:10:47.021631", "step": 649, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:47.077628", "step": 649, "epoch": 1 }, { "type": "loss", "content": 0.008708625100553036, "timestamp": "2025-09-30 22:10:47.080916", "step": 650, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:47.135851", "step": 650, "epoch": 1 }, { "type": "loss", "content": 0.02237722836434841, "timestamp": "2025-09-30 22:10:47.138706", "step": 651, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:47.193613", "step": 651, "epoch": 1 }, { "type": "loss", "content": 0.015081758610904217, "timestamp": "2025-09-30 22:10:47.204993", "step": 652, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:47.261802", "step": 652, "epoch": 1 }, { "type": "loss", "content": 0.020020443946123123, "timestamp": "2025-09-30 22:10:47.264706", "step": 653, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:47.325696", "step": 653, "epoch": 1 }, { "type": "loss", "content": 0.029474765062332153, "timestamp": "2025-09-30 22:10:47.335406", "step": 654, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:47.396809", "step": 654, "epoch": 1 }, { "type": "loss", "content": 0.0116732781752944, "timestamp": "2025-09-30 22:10:47.399639", "step": 655, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:47.458257", "step": 655, "epoch": 1 }, { "type": "loss", "content": 0.007577078882604837, "timestamp": "2025-09-30 22:10:47.464479", "step": 656, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:47.526732", "step": 656, "epoch": 1 }, { "type": "loss", "content": 0.0052547636441886425, "timestamp": "2025-09-30 22:10:47.529806", "step": 657, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:47.586913", "step": 657, "epoch": 1 }, { "type": "loss", "content": 0.018849587067961693, "timestamp": "2025-09-30 22:10:47.593218", "step": 658, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:47.660670", "step": 658, "epoch": 1 }, { "type": "loss", "content": 0.009038467891514301, "timestamp": "2025-09-30 22:10:47.665059", "step": 659, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:47.724782", "step": 659, "epoch": 1 }, { "type": "loss", "content": 0.01595907285809517, "timestamp": "2025-09-30 22:10:47.732513", "step": 660, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:47.789691", "step": 660, "epoch": 1 }, { "type": "loss", "content": 0.013368090614676476, "timestamp": "2025-09-30 22:10:47.791930", "step": 661, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:47.850005", "step": 661, "epoch": 1 }, { "type": "loss", "content": 0.006896801292896271, "timestamp": "2025-09-30 22:10:47.854407", "step": 662, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:47.916187", "step": 662, "epoch": 1 }, { "type": "loss", "content": 0.01652236096560955, "timestamp": "2025-09-30 22:10:47.918980", "step": 663, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:47.979786", "step": 663, "epoch": 1 }, { "type": "loss", "content": 0.024208780378103256, "timestamp": "2025-09-30 22:10:47.995948", "step": 664, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:48.049264", "step": 664, "epoch": 1 }, { "type": "loss", "content": 0.01214011013507843, "timestamp": "2025-09-30 22:10:48.051480", "step": 665, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:48.105936", "step": 665, "epoch": 1 }, { "type": "loss", "content": 0.00862074550241232, "timestamp": "2025-09-30 22:10:48.109173", "step": 666, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:48.163901", "step": 666, "epoch": 1 }, { "type": "loss", "content": 0.015050886198878288, "timestamp": "2025-09-30 22:10:48.167571", "step": 667, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:48.222532", "step": 667, "epoch": 1 }, { "type": "loss", "content": 0.013227096758782864, "timestamp": "2025-09-30 22:10:48.228725", "step": 668, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:48.283259", "step": 668, "epoch": 1 }, { "type": "loss", "content": 0.038987185806035995, "timestamp": "2025-09-30 22:10:48.288928", "step": 669, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:48.351701", "step": 669, "epoch": 1 }, { "type": "loss", "content": 0.04726878181099892, "timestamp": "2025-09-30 22:10:48.354165", "step": 670, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:48.410143", "step": 670, "epoch": 1 }, { "type": "loss", "content": 0.008983231149613857, "timestamp": "2025-09-30 22:10:48.413801", "step": 671, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:48.468268", "step": 671, "epoch": 1 }, { "type": "loss", "content": 0.018243545666337013, "timestamp": "2025-09-30 22:10:48.475102", "step": 672, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:48.531770", "step": 672, "epoch": 1 }, { "type": "loss", "content": 0.02222895435988903, "timestamp": "2025-09-30 22:10:48.536427", "step": 673, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:48.596231", "step": 673, "epoch": 1 }, { "type": "loss", "content": 0.02621746063232422, "timestamp": "2025-09-30 22:10:48.598374", "step": 674, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:48.653154", "step": 674, "epoch": 1 }, { "type": "loss", "content": 0.04202299192547798, "timestamp": "2025-09-30 22:10:48.656988", "step": 675, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:48.716543", "step": 675, "epoch": 1 }, { "type": "loss", "content": 0.014642206020653248, "timestamp": "2025-09-30 22:10:48.724600", "step": 676, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:48.784864", "step": 676, "epoch": 1 }, { "type": "loss", "content": 0.020012324675917625, "timestamp": "2025-09-30 22:10:48.787118", "step": 677, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:48.846088", "step": 677, "epoch": 1 }, { "type": "loss", "content": 0.00840475969016552, "timestamp": "2025-09-30 22:10:48.848230", "step": 678, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:48.902428", "step": 678, "epoch": 1 }, { "type": "loss", "content": 0.007752344012260437, "timestamp": "2025-09-30 22:10:48.909039", "step": 679, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:48.967559", "step": 679, "epoch": 1 }, { "type": "loss", "content": 0.013558789156377316, "timestamp": "2025-09-30 22:10:48.975134", "step": 680, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:49.029934", "step": 680, "epoch": 1 }, { "type": "loss", "content": 0.010377148166298866, "timestamp": "2025-09-30 22:10:49.044282", "step": 681, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:49.099360", "step": 681, "epoch": 1 }, { "type": "loss", "content": 0.012168629094958305, "timestamp": "2025-09-30 22:10:49.101658", "step": 682, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:49.166872", "step": 682, "epoch": 1 }, { "type": "loss", "content": 0.015606013126671314, "timestamp": "2025-09-30 22:10:49.169829", "step": 683, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:49.234287", "step": 683, "epoch": 1 }, { "type": "loss", "content": 0.009783417917788029, "timestamp": "2025-09-30 22:10:49.251208", "step": 684, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:10:50.679333", "step": 684, "epoch": 1 }, { "type": "pplx", "content": 32421580.472615503, "timestamp": "2025-09-30 22:10:50.682440", "step": 684, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:50.737433", "step": 684, "epoch": 1 }, { "type": "loss", "content": 0.030063265934586525, "timestamp": "2025-09-30 22:10:50.740440", "step": 685, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:50.803382", "step": 685, "epoch": 1 }, { "type": "loss", "content": 0.01967502571642399, "timestamp": "2025-09-30 22:10:50.806394", "step": 686, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:50.862605", "step": 686, "epoch": 1 }, { "type": "loss", "content": 0.008366209454834461, "timestamp": "2025-09-30 22:10:50.865456", "step": 687, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:50.922320", "step": 687, "epoch": 1 }, { "type": "loss", "content": 0.010231166146695614, "timestamp": "2025-09-30 22:10:50.928995", "step": 688, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:50.983225", "step": 688, "epoch": 1 }, { "type": "loss", "content": 0.027960339561104774, "timestamp": "2025-09-30 22:10:50.985402", "step": 689, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:51.048020", "step": 689, "epoch": 1 }, { "type": "loss", "content": 0.008330179378390312, "timestamp": "2025-09-30 22:10:51.052493", "step": 690, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:51.112208", "step": 690, "epoch": 1 }, { "type": "loss", "content": 0.01055544801056385, "timestamp": "2025-09-30 22:10:51.115301", "step": 691, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:51.173114", "step": 691, "epoch": 1 }, { "type": "loss", "content": 0.010108939372003078, "timestamp": "2025-09-30 22:10:51.179615", "step": 692, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:51.235667", "step": 692, "epoch": 1 }, { "type": "loss", "content": 0.021281301975250244, "timestamp": "2025-09-30 22:10:51.239421", "step": 693, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:51.294223", "step": 693, "epoch": 1 }, { "type": "loss", "content": 0.05024373531341553, "timestamp": "2025-09-30 22:10:51.296468", "step": 694, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:51.351440", "step": 694, "epoch": 1 }, { "type": "loss", "content": 0.005921120289713144, "timestamp": "2025-09-30 22:10:51.354235", "step": 695, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:51.409886", "step": 695, "epoch": 1 }, { "type": "loss", "content": 0.02190905436873436, "timestamp": "2025-09-30 22:10:51.415774", "step": 696, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:51.473821", "step": 696, "epoch": 1 }, { "type": "loss", "content": 0.012462352402508259, "timestamp": "2025-09-30 22:10:51.476265", "step": 697, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:51.532028", "step": 697, "epoch": 1 }, { "type": "loss", "content": 0.009990805760025978, "timestamp": "2025-09-30 22:10:51.534677", "step": 698, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:51.589732", "step": 698, "epoch": 1 }, { "type": "loss", "content": 0.023069290444254875, "timestamp": "2025-09-30 22:10:51.596286", "step": 699, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:51.650913", "step": 699, "epoch": 1 }, { "type": "loss", "content": 0.025465210899710655, "timestamp": "2025-09-30 22:10:51.657683", "step": 700, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:51.729574", "step": 700, "epoch": 1 }, { "type": "loss", "content": 0.007522939704358578, "timestamp": "2025-09-30 22:10:51.731746", "step": 701, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:51.786127", "step": 701, "epoch": 1 }, { "type": "loss", "content": 0.023840337991714478, "timestamp": "2025-09-30 22:10:51.788879", "step": 702, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:51.843789", "step": 702, "epoch": 1 }, { "type": "loss", "content": 0.01778830774128437, "timestamp": "2025-09-30 22:10:51.860122", "step": 703, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:51.917114", "step": 703, "epoch": 1 }, { "type": "loss", "content": 0.013482254929840565, "timestamp": "2025-09-30 22:10:51.923966", "step": 704, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:51.978573", "step": 704, "epoch": 1 }, { "type": "loss", "content": 0.018839532509446144, "timestamp": "2025-09-30 22:10:51.980911", "step": 705, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:52.038927", "step": 705, "epoch": 1 }, { "type": "loss", "content": 0.006275936495512724, "timestamp": "2025-09-30 22:10:52.041640", "step": 706, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:52.096729", "step": 706, "epoch": 1 }, { "type": "loss", "content": 0.006527402438223362, "timestamp": "2025-09-30 22:10:52.099087", "step": 707, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:52.152269", "step": 707, "epoch": 1 }, { "type": "loss", "content": 0.016934264451265335, "timestamp": "2025-09-30 22:10:52.158593", "step": 708, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:52.217087", "step": 708, "epoch": 1 }, { "type": "loss", "content": 0.013991935178637505, "timestamp": "2025-09-30 22:10:52.220158", "step": 709, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:52.273558", "step": 709, "epoch": 1 }, { "type": "loss", "content": 0.043482519686222076, "timestamp": "2025-09-30 22:10:52.277808", "step": 710, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:10:52.335518", "step": 710, "epoch": 1 }, { "type": "loss", "content": 0.02595517970621586, "timestamp": "2025-09-30 22:10:52.338420", "step": 711, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:52.398550", "step": 711, "epoch": 1 }, { "type": "loss", "content": 0.03393742814660072, "timestamp": "2025-09-30 22:10:52.404498", "step": 712, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:52.467147", "step": 712, "epoch": 1 }, { "type": "loss", "content": 0.019353583455085754, "timestamp": "2025-09-30 22:10:52.470659", "step": 713, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:52.529899", "step": 713, "epoch": 1 }, { "type": "loss", "content": 0.01128054317086935, "timestamp": "2025-09-30 22:10:52.535951", "step": 714, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:52.590776", "step": 714, "epoch": 1 }, { "type": "loss", "content": 0.021564552560448647, "timestamp": "2025-09-30 22:10:52.595225", "step": 715, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:52.655656", "step": 715, "epoch": 1 }, { "type": "loss", "content": 0.03504403680562973, "timestamp": "2025-09-30 22:10:52.663444", "step": 716, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:52.719328", "step": 716, "epoch": 1 }, { "type": "loss", "content": 0.0178972315043211, "timestamp": "2025-09-30 22:10:52.723225", "step": 717, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:52.784679", "step": 717, "epoch": 1 }, { "type": "loss", "content": 0.019097017124295235, "timestamp": "2025-09-30 22:10:52.791581", "step": 718, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:52.847779", "step": 718, "epoch": 1 }, { "type": "loss", "content": 0.01555024366825819, "timestamp": "2025-09-30 22:10:52.851233", "step": 719, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:52.905574", "step": 719, "epoch": 1 }, { "type": "loss", "content": 0.03621254116296768, "timestamp": "2025-09-30 22:10:52.912518", "step": 720, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:52.965224", "step": 720, "epoch": 1 }, { "type": "loss", "content": 0.00840581115335226, "timestamp": "2025-09-30 22:10:52.968506", "step": 721, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:53.026689", "step": 721, "epoch": 1 }, { "type": "loss", "content": 0.007782516535371542, "timestamp": "2025-09-30 22:10:53.030738", "step": 722, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:53.085295", "step": 722, "epoch": 1 }, { "type": "loss", "content": 0.012187975458800793, "timestamp": "2025-09-30 22:10:53.089215", "step": 723, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:53.144371", "step": 723, "epoch": 1 }, { "type": "loss", "content": 0.0311259888112545, "timestamp": "2025-09-30 22:10:53.150171", "step": 724, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:53.212857", "step": 724, "epoch": 1 }, { "type": "loss", "content": 0.030853962525725365, "timestamp": "2025-09-30 22:10:53.216394", "step": 725, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:53.272679", "step": 725, "epoch": 1 }, { "type": "loss", "content": 0.02167673408985138, "timestamp": "2025-09-30 22:10:53.275493", "step": 726, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:53.332586", "step": 726, "epoch": 1 }, { "type": "loss", "content": 0.014911350794136524, "timestamp": "2025-09-30 22:10:53.338789", "step": 727, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:53.401157", "step": 727, "epoch": 1 }, { "type": "loss", "content": 0.009337635710835457, "timestamp": "2025-09-30 22:10:53.413064", "step": 728, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:53.469760", "step": 728, "epoch": 1 }, { "type": "loss", "content": 0.007847541943192482, "timestamp": "2025-09-30 22:10:53.473222", "step": 729, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:53.536806", "step": 729, "epoch": 1 }, { "type": "loss", "content": 0.021578481420874596, "timestamp": "2025-09-30 22:10:53.540382", "step": 730, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:53.601369", "step": 730, "epoch": 1 }, { "type": "loss", "content": 0.009829229675233364, "timestamp": "2025-09-30 22:10:53.604142", "step": 731, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:53.675425", "step": 731, "epoch": 1 }, { "type": "loss", "content": 0.022942425683140755, "timestamp": "2025-09-30 22:10:53.683426", "step": 732, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:53.740581", "step": 732, "epoch": 1 }, { "type": "loss", "content": 0.018000300973653793, "timestamp": "2025-09-30 22:10:53.743633", "step": 733, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:53.802944", "step": 733, "epoch": 1 }, { "type": "loss", "content": 0.01805448904633522, "timestamp": "2025-09-30 22:10:53.806262", "step": 734, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:53.865277", "step": 734, "epoch": 1 }, { "type": "loss", "content": 0.01143564097583294, "timestamp": "2025-09-30 22:10:53.868349", "step": 735, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:53.929590", "step": 735, "epoch": 1 }, { "type": "loss", "content": 0.010203097946941853, "timestamp": "2025-09-30 22:10:53.935468", "step": 736, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:54.001180", "step": 736, "epoch": 1 }, { "type": "loss", "content": 0.028915002942085266, "timestamp": "2025-09-30 22:10:54.003750", "step": 737, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:54.064463", "step": 737, "epoch": 1 }, { "type": "loss", "content": 0.01114210207015276, "timestamp": "2025-09-30 22:10:54.067247", "step": 738, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:54.121329", "step": 738, "epoch": 1 }, { "type": "loss", "content": 0.02133123390376568, "timestamp": "2025-09-30 22:10:54.124058", "step": 739, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:54.189872", "step": 739, "epoch": 1 }, { "type": "loss", "content": 0.034789033234119415, "timestamp": "2025-09-30 22:10:54.196789", "step": 740, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:54.264175", "step": 740, "epoch": 1 }, { "type": "loss", "content": 0.029856473207473755, "timestamp": "2025-09-30 22:10:54.266805", "step": 741, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:10:55.578341", "step": 741, "epoch": 1 }, { "type": "pplx", "content": 32540290.98788411, "timestamp": "2025-09-30 22:10:55.580487", "step": 741, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:55.634594", "step": 741, "epoch": 1 }, { "type": "loss", "content": 0.011041047051548958, "timestamp": "2025-09-30 22:10:55.637911", "step": 742, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:55.693181", "step": 742, "epoch": 1 }, { "type": "loss", "content": 0.013807429000735283, "timestamp": "2025-09-30 22:10:55.695078", "step": 743, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:55.751626", "step": 743, "epoch": 1 }, { "type": "loss", "content": 0.03291318565607071, "timestamp": "2025-09-30 22:10:55.757522", "step": 744, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:55.812977", "step": 744, "epoch": 1 }, { "type": "loss", "content": 0.017772836610674858, "timestamp": "2025-09-30 22:10:55.817366", "step": 745, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:55.870542", "step": 745, "epoch": 1 }, { "type": "loss", "content": 0.010666130110621452, "timestamp": "2025-09-30 22:10:55.872940", "step": 746, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:55.927591", "step": 746, "epoch": 1 }, { "type": "loss", "content": 0.008871505968272686, "timestamp": "2025-09-30 22:10:55.929995", "step": 747, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:55.983417", "step": 747, "epoch": 1 }, { "type": "loss", "content": 0.02815295197069645, "timestamp": "2025-09-30 22:10:55.989362", "step": 748, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:56.043054", "step": 748, "epoch": 1 }, { "type": "loss", "content": 0.018090158700942993, "timestamp": "2025-09-30 22:10:56.045234", "step": 749, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:56.098937", "step": 749, "epoch": 1 }, { "type": "loss", "content": 0.012871643528342247, "timestamp": "2025-09-30 22:10:56.101068", "step": 750, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:56.155911", "step": 750, "epoch": 1 }, { "type": "loss", "content": 0.02255532518029213, "timestamp": "2025-09-30 22:10:56.167119", "step": 751, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:56.225904", "step": 751, "epoch": 1 }, { "type": "loss", "content": 0.03662119060754776, "timestamp": "2025-09-30 22:10:56.231510", "step": 752, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:56.285709", "step": 752, "epoch": 1 }, { "type": "loss", "content": 0.01848854124546051, "timestamp": "2025-09-30 22:10:56.289641", "step": 753, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:56.347354", "step": 753, "epoch": 1 }, { "type": "loss", "content": 0.010717155411839485, "timestamp": "2025-09-30 22:10:56.355255", "step": 754, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:56.422808", "step": 754, "epoch": 1 }, { "type": "loss", "content": 0.014511531218886375, "timestamp": "2025-09-30 22:10:56.424913", "step": 755, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:56.482154", "step": 755, "epoch": 1 }, { "type": "loss", "content": 0.026509271934628487, "timestamp": "2025-09-30 22:10:56.489952", "step": 756, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:56.549793", "step": 756, "epoch": 1 }, { "type": "loss", "content": 0.004727398511022329, "timestamp": "2025-09-30 22:10:56.553813", "step": 757, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:56.613752", "step": 757, "epoch": 1 }, { "type": "loss", "content": 0.0046147494576871395, "timestamp": "2025-09-30 22:10:56.620038", "step": 758, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:56.678010", "step": 758, "epoch": 1 }, { "type": "loss", "content": 0.009979399852454662, "timestamp": "2025-09-30 22:10:56.680933", "step": 759, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:56.742069", "step": 759, "epoch": 1 }, { "type": "loss", "content": 0.0126791438087821, "timestamp": "2025-09-30 22:10:56.750293", "step": 760, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:56.814648", "step": 760, "epoch": 1 }, { "type": "loss", "content": 0.005177278071641922, "timestamp": "2025-09-30 22:10:56.817288", "step": 761, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:10:56.881049", "step": 761, "epoch": 1 }, { "type": "loss", "content": 0.009649130515754223, "timestamp": "2025-09-30 22:10:56.886005", "step": 762, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:56.949170", "step": 762, "epoch": 1 }, { "type": "loss", "content": 0.036273036152124405, "timestamp": "2025-09-30 22:10:56.957677", "step": 763, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:57.012209", "step": 763, "epoch": 1 }, { "type": "loss", "content": 0.009934181347489357, "timestamp": "2025-09-30 22:10:57.020192", "step": 764, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:57.081575", "step": 764, "epoch": 1 }, { "type": "loss", "content": 0.009199343621730804, "timestamp": "2025-09-30 22:10:57.095822", "step": 765, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:57.156959", "step": 765, "epoch": 1 }, { "type": "loss", "content": 0.013879785314202309, "timestamp": "2025-09-30 22:10:57.159436", "step": 766, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:57.212583", "step": 766, "epoch": 1 }, { "type": "loss", "content": 0.00856445636600256, "timestamp": "2025-09-30 22:10:57.216284", "step": 767, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:57.276051", "step": 767, "epoch": 1 }, { "type": "loss", "content": 0.04576727747917175, "timestamp": "2025-09-30 22:10:57.284101", "step": 768, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:57.339006", "step": 768, "epoch": 1 }, { "type": "loss", "content": 0.01549856923520565, "timestamp": "2025-09-30 22:10:57.347060", "step": 769, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:57.406865", "step": 769, "epoch": 1 }, { "type": "loss", "content": 0.035134777426719666, "timestamp": "2025-09-30 22:10:57.425285", "step": 770, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:57.479947", "step": 770, "epoch": 1 }, { "type": "loss", "content": 0.027558207511901855, "timestamp": "2025-09-30 22:10:57.500451", "step": 771, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:57.572535", "step": 771, "epoch": 1 }, { "type": "loss", "content": 0.008749599568545818, "timestamp": "2025-09-30 22:10:57.578703", "step": 772, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:57.632785", "step": 772, "epoch": 1 }, { "type": "loss", "content": 0.0587838776409626, "timestamp": "2025-09-30 22:10:57.634782", "step": 773, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:57.702651", "step": 773, "epoch": 1 }, { "type": "loss", "content": 0.004769704304635525, "timestamp": "2025-09-30 22:10:57.704741", "step": 774, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:57.762540", "step": 774, "epoch": 1 }, { "type": "loss", "content": 0.030832087621092796, "timestamp": "2025-09-30 22:10:57.764624", "step": 775, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:57.818884", "step": 775, "epoch": 1 }, { "type": "loss", "content": 0.02731013298034668, "timestamp": "2025-09-30 22:10:57.824632", "step": 776, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:57.877914", "step": 776, "epoch": 1 }, { "type": "loss", "content": 0.007288725581020117, "timestamp": "2025-09-30 22:10:57.880273", "step": 777, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:57.936165", "step": 777, "epoch": 1 }, { "type": "loss", "content": 0.003071850398555398, "timestamp": "2025-09-30 22:10:57.938334", "step": 778, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:57.997184", "step": 778, "epoch": 1 }, { "type": "loss", "content": 0.02605491690337658, "timestamp": "2025-09-30 22:10:57.999595", "step": 779, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:58.055531", "step": 779, "epoch": 1 }, { "type": "loss", "content": 0.006960700731724501, "timestamp": "2025-09-30 22:10:58.061276", "step": 780, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:58.121422", "step": 780, "epoch": 1 }, { "type": "loss", "content": 0.018633553758263588, "timestamp": "2025-09-30 22:10:58.124436", "step": 781, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:58.186438", "step": 781, "epoch": 1 }, { "type": "loss", "content": 0.0156040508300066, "timestamp": "2025-09-30 22:10:58.188684", "step": 782, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:58.241872", "step": 782, "epoch": 1 }, { "type": "loss", "content": 0.024231048300862312, "timestamp": "2025-09-30 22:10:58.244128", "step": 783, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:58.298950", "step": 783, "epoch": 1 }, { "type": "loss", "content": 0.01897078938782215, "timestamp": "2025-09-30 22:10:58.304930", "step": 784, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:58.362591", "step": 784, "epoch": 1 }, { "type": "loss", "content": 0.007195747457444668, "timestamp": "2025-09-30 22:10:58.364851", "step": 785, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:58.419197", "step": 785, "epoch": 1 }, { "type": "loss", "content": 0.004759868141263723, "timestamp": "2025-09-30 22:10:58.421346", "step": 786, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:58.475595", "step": 786, "epoch": 1 }, { "type": "loss", "content": 0.006611085496842861, "timestamp": "2025-09-30 22:10:58.477615", "step": 787, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:58.538663", "step": 787, "epoch": 1 }, { "type": "loss", "content": 0.024320388212800026, "timestamp": "2025-09-30 22:10:58.544699", "step": 788, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:58.604428", "step": 788, "epoch": 1 }, { "type": "loss", "content": 0.01196072157472372, "timestamp": "2025-09-30 22:10:58.606592", "step": 789, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:10:58.664400", "step": 789, "epoch": 1 }, { "type": "loss", "content": 0.00526924803853035, "timestamp": "2025-09-30 22:10:58.666685", "step": 790, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:58.721426", "step": 790, "epoch": 1 }, { "type": "loss", "content": 0.013081463053822517, "timestamp": "2025-09-30 22:10:58.724083", "step": 791, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:58.780350", "step": 791, "epoch": 1 }, { "type": "loss", "content": 0.021804455667734146, "timestamp": "2025-09-30 22:10:58.786275", "step": 792, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:58.839769", "step": 792, "epoch": 1 }, { "type": "loss", "content": 0.014370249584317207, "timestamp": "2025-09-30 22:10:58.843237", "step": 793, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:58.897409", "step": 793, "epoch": 1 }, { "type": "loss", "content": 0.03542889282107353, "timestamp": "2025-09-30 22:10:58.899546", "step": 794, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:58.957528", "step": 794, "epoch": 1 }, { "type": "loss", "content": 0.014196035452187061, "timestamp": "2025-09-30 22:10:58.959878", "step": 795, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:59.013285", "step": 795, "epoch": 1 }, { "type": "loss", "content": 0.006897695828229189, "timestamp": "2025-09-30 22:10:59.019364", "step": 796, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:10:59.072892", "step": 796, "epoch": 1 }, { "type": "loss", "content": 0.009900706820189953, "timestamp": "2025-09-30 22:10:59.078591", "step": 797, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:10:59.133963", "step": 797, "epoch": 1 }, { "type": "loss", "content": 0.017876118421554565, "timestamp": "2025-09-30 22:10:59.136126", "step": 798, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:11:00.461430", "step": 798, "epoch": 1 }, { "type": "pplx", "content": 35240375.419794545, "timestamp": "2025-09-30 22:11:00.463470", "step": 798, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:00.520434", "step": 798, "epoch": 1 }, { "type": "loss", "content": 0.0068840510211884975, "timestamp": "2025-09-30 22:11:00.522737", "step": 799, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:00.575833", "step": 799, "epoch": 1 }, { "type": "loss", "content": 0.032674796879291534, "timestamp": "2025-09-30 22:11:00.583553", "step": 800, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:00.636895", "step": 800, "epoch": 1 }, { "type": "loss", "content": 0.026194969192147255, "timestamp": "2025-09-30 22:11:00.640345", "step": 801, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:00.694348", "step": 801, "epoch": 1 }, { "type": "loss", "content": 0.029077323153614998, "timestamp": "2025-09-30 22:11:00.696603", "step": 802, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:00.751803", "step": 802, "epoch": 1 }, { "type": "loss", "content": 0.021133294329047203, "timestamp": "2025-09-30 22:11:00.754628", "step": 803, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:00.810255", "step": 803, "epoch": 1 }, { "type": "loss", "content": 0.008854121901094913, "timestamp": "2025-09-30 22:11:00.816284", "step": 804, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:00.882520", "step": 804, "epoch": 1 }, { "type": "loss", "content": 0.025976279750466347, "timestamp": "2025-09-30 22:11:00.885485", "step": 805, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:00.940450", "step": 805, "epoch": 1 }, { "type": "loss", "content": 0.02907080017030239, "timestamp": "2025-09-30 22:11:00.942441", "step": 806, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:00.995813", "step": 806, "epoch": 1 }, { "type": "loss", "content": 0.009363925084471703, "timestamp": "2025-09-30 22:11:00.998106", "step": 807, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:01.051881", "step": 807, "epoch": 1 }, { "type": "loss", "content": 0.017092768102884293, "timestamp": "2025-09-30 22:11:01.057512", "step": 808, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:01.111311", "step": 808, "epoch": 1 }, { "type": "loss", "content": 0.00928075797855854, "timestamp": "2025-09-30 22:11:01.113383", "step": 809, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:01.168876", "step": 809, "epoch": 1 }, { "type": "loss", "content": 0.015542738139629364, "timestamp": "2025-09-30 22:11:01.171649", "step": 810, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:01.227333", "step": 810, "epoch": 1 }, { "type": "loss", "content": 0.022915521636605263, "timestamp": "2025-09-30 22:11:01.229426", "step": 811, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:01.282472", "step": 811, "epoch": 1 }, { "type": "loss", "content": 0.012876084074378014, "timestamp": "2025-09-30 22:11:01.288399", "step": 812, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:01.342096", "step": 812, "epoch": 1 }, { "type": "loss", "content": 0.0221566129475832, "timestamp": "2025-09-30 22:11:01.344402", "step": 813, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:01.397809", "step": 813, "epoch": 1 }, { "type": "loss", "content": 0.017164286226034164, "timestamp": "2025-09-30 22:11:01.399891", "step": 814, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:01.453987", "step": 814, "epoch": 1 }, { "type": "loss", "content": 0.009646688587963581, "timestamp": "2025-09-30 22:11:01.457594", "step": 815, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:01.513635", "step": 815, "epoch": 1 }, { "type": "loss", "content": 0.015178002417087555, "timestamp": "2025-09-30 22:11:01.522945", "step": 816, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:01.577810", "step": 816, "epoch": 1 }, { "type": "loss", "content": 0.030943872407078743, "timestamp": "2025-09-30 22:11:01.580407", "step": 817, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:01.635707", "step": 817, "epoch": 1 }, { "type": "loss", "content": 0.016377057880163193, "timestamp": "2025-09-30 22:11:01.638511", "step": 818, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:01.692527", "step": 818, "epoch": 1 }, { "type": "loss", "content": 0.007752551231533289, "timestamp": "2025-09-30 22:11:01.694955", "step": 819, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:01.748220", "step": 819, "epoch": 1 }, { "type": "loss", "content": 0.011008880101144314, "timestamp": "2025-09-30 22:11:01.754611", "step": 820, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:01.808426", "step": 820, "epoch": 1 }, { "type": "loss", "content": 0.011034859344363213, "timestamp": "2025-09-30 22:11:01.813052", "step": 821, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:01.869500", "step": 821, "epoch": 1 }, { "type": "loss", "content": 0.01996428705751896, "timestamp": "2025-09-30 22:11:01.873235", "step": 822, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:01.935142", "step": 822, "epoch": 1 }, { "type": "loss", "content": 0.017574485391378403, "timestamp": "2025-09-30 22:11:01.944753", "step": 823, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:02.005799", "step": 823, "epoch": 1 }, { "type": "loss", "content": 0.024993544444441795, "timestamp": "2025-09-30 22:11:02.012099", "step": 824, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:02.066524", "step": 824, "epoch": 1 }, { "type": "loss", "content": 0.010080697014927864, "timestamp": "2025-09-30 22:11:02.072252", "step": 825, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:02.126597", "step": 825, "epoch": 1 }, { "type": "loss", "content": 0.015512553974986076, "timestamp": "2025-09-30 22:11:02.129066", "step": 826, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:02.186535", "step": 826, "epoch": 1 }, { "type": "loss", "content": 0.028046919032931328, "timestamp": "2025-09-30 22:11:02.192835", "step": 827, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:02.258556", "step": 827, "epoch": 1 }, { "type": "loss", "content": 0.01140694972127676, "timestamp": "2025-09-30 22:11:02.264406", "step": 828, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:02.325633", "step": 828, "epoch": 1 }, { "type": "loss", "content": 0.01551117654889822, "timestamp": "2025-09-30 22:11:02.330248", "step": 829, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:02.383542", "step": 829, "epoch": 1 }, { "type": "loss", "content": 0.005076105706393719, "timestamp": "2025-09-30 22:11:02.390134", "step": 830, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:02.447396", "step": 830, "epoch": 1 }, { "type": "loss", "content": 0.007691748905926943, "timestamp": "2025-09-30 22:11:02.464235", "step": 831, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:02.521544", "step": 831, "epoch": 1 }, { "type": "loss", "content": 0.008671620860695839, "timestamp": "2025-09-30 22:11:02.535575", "step": 832, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:02.590330", "step": 832, "epoch": 1 }, { "type": "loss", "content": 0.031672995537519455, "timestamp": "2025-09-30 22:11:02.594947", "step": 833, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:02.650326", "step": 833, "epoch": 1 }, { "type": "loss", "content": 0.01112611498683691, "timestamp": "2025-09-30 22:11:02.662307", "step": 834, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:02.727931", "step": 834, "epoch": 1 }, { "type": "loss", "content": 0.0036083075683563948, "timestamp": "2025-09-30 22:11:02.731299", "step": 835, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:02.789283", "step": 835, "epoch": 1 }, { "type": "loss", "content": 0.008130187168717384, "timestamp": "2025-09-30 22:11:02.799273", "step": 836, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:02.857425", "step": 836, "epoch": 1 }, { "type": "loss", "content": 0.036892350763082504, "timestamp": "2025-09-30 22:11:02.860090", "step": 837, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:02.922647", "step": 837, "epoch": 1 }, { "type": "loss", "content": 0.018781444057822227, "timestamp": "2025-09-30 22:11:02.931661", "step": 838, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:02.986358", "step": 838, "epoch": 1 }, { "type": "loss", "content": 0.029954804107546806, "timestamp": "2025-09-30 22:11:02.988486", "step": 839, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:03.047723", "step": 839, "epoch": 1 }, { "type": "loss", "content": 0.01475707720965147, "timestamp": "2025-09-30 22:11:03.054607", "step": 840, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:03.117898", "step": 840, "epoch": 1 }, { "type": "loss", "content": 0.01812085136771202, "timestamp": "2025-09-30 22:11:03.120429", "step": 841, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:03.174268", "step": 841, "epoch": 1 }, { "type": "loss", "content": 0.030853111296892166, "timestamp": "2025-09-30 22:11:03.176617", "step": 842, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:03.235615", "step": 842, "epoch": 1 }, { "type": "loss", "content": 0.014583474956452847, "timestamp": "2025-09-30 22:11:03.237574", "step": 843, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:03.290976", "step": 843, "epoch": 1 }, { "type": "loss", "content": 0.020900966599583626, "timestamp": "2025-09-30 22:11:03.301990", "step": 844, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:03.356268", "step": 844, "epoch": 1 }, { "type": "loss", "content": 0.0058554792776703835, "timestamp": "2025-09-30 22:11:03.358429", "step": 845, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:03.413993", "step": 845, "epoch": 1 }, { "type": "loss", "content": 0.001921547343954444, "timestamp": "2025-09-30 22:11:03.416165", "step": 846, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:03.470304", "step": 846, "epoch": 1 }, { "type": "loss", "content": 0.033496271818876266, "timestamp": "2025-09-30 22:11:03.472297", "step": 847, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:03.525170", "step": 847, "epoch": 1 }, { "type": "loss", "content": 0.01942761428654194, "timestamp": "2025-09-30 22:11:03.530403", "step": 848, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:03.586709", "step": 848, "epoch": 1 }, { "type": "loss", "content": 0.003972503822296858, "timestamp": "2025-09-30 22:11:03.588508", "step": 849, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:03.646023", "step": 849, "epoch": 1 }, { "type": "loss", "content": 0.00408882787451148, "timestamp": "2025-09-30 22:11:03.648241", "step": 850, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:03.706513", "step": 850, "epoch": 1 }, { "type": "loss", "content": 0.00527701573446393, "timestamp": "2025-09-30 22:11:03.708413", "step": 851, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:03.762487", "step": 851, "epoch": 1 }, { "type": "loss", "content": 0.016006560996174812, "timestamp": "2025-09-30 22:11:03.768331", "step": 852, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:03.821876", "step": 852, "epoch": 1 }, { "type": "loss", "content": 0.006566441617906094, "timestamp": "2025-09-30 22:11:03.823974", "step": 853, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:03.879460", "step": 853, "epoch": 1 }, { "type": "loss", "content": 0.02836592122912407, "timestamp": "2025-09-30 22:11:03.881726", "step": 854, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:03.937448", "step": 854, "epoch": 1 }, { "type": "loss", "content": 0.006195141933858395, "timestamp": "2025-09-30 22:11:03.939682", "step": 855, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:11:05.130762", "step": 855, "epoch": 1 }, { "type": "pplx", "content": 37193476.96670681, "timestamp": "2025-09-30 22:11:05.132796", "step": 855, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:05.183818", "step": 855, "epoch": 1 }, { "type": "loss", "content": 0.005694164428859949, "timestamp": "2025-09-30 22:11:05.189422", "step": 856, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:05.241745", "step": 856, "epoch": 1 }, { "type": "loss", "content": 0.012527307495474815, "timestamp": "2025-09-30 22:11:05.248245", "step": 857, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:05.301596", "step": 857, "epoch": 1 }, { "type": "loss", "content": 0.010429566726088524, "timestamp": "2025-09-30 22:11:05.303733", "step": 858, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:05.357377", "step": 858, "epoch": 1 }, { "type": "loss", "content": 0.01283800695091486, "timestamp": "2025-09-30 22:11:05.359112", "step": 859, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:05.412159", "step": 859, "epoch": 1 }, { "type": "loss", "content": 0.027922047302126884, "timestamp": "2025-09-30 22:11:05.417257", "step": 860, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:05.470079", "step": 860, "epoch": 1 }, { "type": "loss", "content": 0.019196191802620888, "timestamp": "2025-09-30 22:11:05.471898", "step": 861, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:05.525280", "step": 861, "epoch": 1 }, { "type": "loss", "content": 0.006581494119018316, "timestamp": "2025-09-30 22:11:05.529503", "step": 862, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:05.585152", "step": 862, "epoch": 1 }, { "type": "loss", "content": 0.019029339775443077, "timestamp": "2025-09-30 22:11:05.589783", "step": 863, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:05.645567", "step": 863, "epoch": 1 }, { "type": "loss", "content": 0.013398184441030025, "timestamp": "2025-09-30 22:11:05.651621", "step": 864, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:05.705265", "step": 864, "epoch": 1 }, { "type": "loss", "content": 0.02754964306950569, "timestamp": "2025-09-30 22:11:05.708449", "step": 865, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:05.778912", "step": 865, "epoch": 1 }, { "type": "loss", "content": 0.006636460777372122, "timestamp": "2025-09-30 22:11:05.785334", "step": 866, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:05.846711", "step": 866, "epoch": 1 }, { "type": "loss", "content": 0.0036353315226733685, "timestamp": "2025-09-30 22:11:05.849779", "step": 867, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:05.904919", "step": 867, "epoch": 1 }, { "type": "loss", "content": 0.011577283963561058, "timestamp": "2025-09-30 22:11:05.912851", "step": 868, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:05.976102", "step": 868, "epoch": 1 }, { "type": "loss", "content": 0.016597332432866096, "timestamp": "2025-09-30 22:11:05.980116", "step": 869, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:06.039350", "step": 869, "epoch": 1 }, { "type": "loss", "content": 0.044374849647283554, "timestamp": "2025-09-30 22:11:06.043036", "step": 870, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:06.101559", "step": 870, "epoch": 1 }, { "type": "loss", "content": 0.005777058191597462, "timestamp": "2025-09-30 22:11:06.105594", "step": 871, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:06.171476", "step": 871, "epoch": 1 }, { "type": "loss", "content": 0.000960107718128711, "timestamp": "2025-09-30 22:11:06.179702", "step": 872, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:06.249129", "step": 872, "epoch": 1 }, { "type": "loss", "content": 0.018397826701402664, "timestamp": "2025-09-30 22:11:06.254591", "step": 873, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:06.311920", "step": 873, "epoch": 1 }, { "type": "loss", "content": 0.009636670351028442, "timestamp": "2025-09-30 22:11:06.314872", "step": 874, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:06.372349", "step": 874, "epoch": 1 }, { "type": "loss", "content": 0.02451557293534279, "timestamp": "2025-09-30 22:11:06.376520", "step": 875, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:06.451044", "step": 875, "epoch": 1 }, { "type": "loss", "content": 0.036388419568538666, "timestamp": "2025-09-30 22:11:06.458139", "step": 876, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:06.511108", "step": 876, "epoch": 1 }, { "type": "loss", "content": 0.00405623484402895, "timestamp": "2025-09-30 22:11:06.514565", "step": 877, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:06.585352", "step": 877, "epoch": 1 }, { "type": "loss", "content": 0.00108911597635597, "timestamp": "2025-09-30 22:11:06.589256", "step": 878, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:06.647797", "step": 878, "epoch": 1 }, { "type": "loss", "content": 0.040293216705322266, "timestamp": "2025-09-30 22:11:06.650671", "step": 879, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:06.705666", "step": 879, "epoch": 1 }, { "type": "loss", "content": 0.053030043840408325, "timestamp": "2025-09-30 22:11:06.712051", "step": 880, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:06.777617", "step": 880, "epoch": 1 }, { "type": "loss", "content": 0.029138272628188133, "timestamp": "2025-09-30 22:11:06.780854", "step": 881, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:06.835173", "step": 881, "epoch": 1 }, { "type": "loss", "content": 0.04902893677353859, "timestamp": "2025-09-30 22:11:06.841768", "step": 882, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:06.908278", "step": 882, "epoch": 1 }, { "type": "loss", "content": 0.03171558305621147, "timestamp": "2025-09-30 22:11:06.910742", "step": 883, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:06.973977", "step": 883, "epoch": 1 }, { "type": "loss", "content": 0.0021508908830583096, "timestamp": "2025-09-30 22:11:06.979781", "step": 884, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:07.038739", "step": 884, "epoch": 1 }, { "type": "loss", "content": 0.005451333709061146, "timestamp": "2025-09-30 22:11:07.043018", "step": 885, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:07.103661", "step": 885, "epoch": 1 }, { "type": "loss", "content": 0.021850435063242912, "timestamp": "2025-09-30 22:11:07.118706", "step": 886, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:07.182709", "step": 886, "epoch": 1 }, { "type": "loss", "content": 0.007379130460321903, "timestamp": "2025-09-30 22:11:07.196464", "step": 887, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:07.256055", "step": 887, "epoch": 1 }, { "type": "loss", "content": 0.024785201996564865, "timestamp": "2025-09-30 22:11:07.262829", "step": 888, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:07.316559", "step": 888, "epoch": 1 }, { "type": "loss", "content": 0.018535610288381577, "timestamp": "2025-09-30 22:11:07.319236", "step": 889, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:07.373407", "step": 889, "epoch": 1 }, { "type": "loss", "content": 0.03353133052587509, "timestamp": "2025-09-30 22:11:07.379519", "step": 890, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:07.439250", "step": 890, "epoch": 1 }, { "type": "loss", "content": 0.018443452194333076, "timestamp": "2025-09-30 22:11:07.442180", "step": 891, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:07.496055", "step": 891, "epoch": 1 }, { "type": "loss", "content": 0.018285350874066353, "timestamp": "2025-09-30 22:11:07.503954", "step": 892, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:07.557501", "step": 892, "epoch": 1 }, { "type": "loss", "content": 0.012829815968871117, "timestamp": "2025-09-30 22:11:07.567651", "step": 893, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:07.630050", "step": 893, "epoch": 1 }, { "type": "loss", "content": 0.05475155636668205, "timestamp": "2025-09-30 22:11:07.641354", "step": 894, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:07.705736", "step": 894, "epoch": 1 }, { "type": "loss", "content": 0.021429507061839104, "timestamp": "2025-09-30 22:11:07.720924", "step": 895, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:07.786174", "step": 895, "epoch": 1 }, { "type": "loss", "content": 0.02315603196620941, "timestamp": "2025-09-30 22:11:07.796568", "step": 896, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:07.850106", "step": 896, "epoch": 1 }, { "type": "loss", "content": 0.03501684218645096, "timestamp": "2025-09-30 22:11:07.863817", "step": 897, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:07.919325", "step": 897, "epoch": 1 }, { "type": "loss", "content": 0.013003984466195107, "timestamp": "2025-09-30 22:11:07.923166", "step": 898, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:07.980445", "step": 898, "epoch": 1 }, { "type": "loss", "content": 0.010267031379044056, "timestamp": "2025-09-30 22:11:07.984239", "step": 899, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:08.038645", "step": 899, "epoch": 1 }, { "type": "loss", "content": 0.018157100304961205, "timestamp": "2025-09-30 22:11:08.049287", "step": 900, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:08.104784", "step": 900, "epoch": 1 }, { "type": "loss", "content": 0.029190553352236748, "timestamp": "2025-09-30 22:11:08.114701", "step": 901, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:08.172726", "step": 901, "epoch": 1 }, { "type": "loss", "content": 0.026753077283501625, "timestamp": "2025-09-30 22:11:08.179576", "step": 902, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:08.240865", "step": 902, "epoch": 1 }, { "type": "loss", "content": 0.01073368452489376, "timestamp": "2025-09-30 22:11:08.248813", "step": 903, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:08.309720", "step": 903, "epoch": 1 }, { "type": "loss", "content": 0.03276833891868591, "timestamp": "2025-09-30 22:11:08.323380", "step": 904, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:08.380159", "step": 904, "epoch": 1 }, { "type": "loss", "content": 0.01617128774523735, "timestamp": "2025-09-30 22:11:08.395340", "step": 905, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:08.456218", "step": 905, "epoch": 1 }, { "type": "loss", "content": 0.010833910666406155, "timestamp": "2025-09-30 22:11:08.460300", "step": 906, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:08.515914", "step": 906, "epoch": 1 }, { "type": "loss", "content": 0.027199676260352135, "timestamp": "2025-09-30 22:11:08.519480", "step": 907, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:08.576222", "step": 907, "epoch": 1 }, { "type": "loss", "content": 0.018916381523013115, "timestamp": "2025-09-30 22:11:08.583916", "step": 908, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:08.641627", "step": 908, "epoch": 1 }, { "type": "loss", "content": 0.02062804065644741, "timestamp": "2025-09-30 22:11:08.644997", "step": 909, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:08.706192", "step": 909, "epoch": 1 }, { "type": "loss", "content": 0.0058442666195333, "timestamp": "2025-09-30 22:11:08.709737", "step": 910, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:08.766142", "step": 910, "epoch": 1 }, { "type": "loss", "content": 0.01607462391257286, "timestamp": "2025-09-30 22:11:08.778255", "step": 911, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:08.836438", "step": 911, "epoch": 1 }, { "type": "loss", "content": 0.007725600618869066, "timestamp": "2025-09-30 22:11:08.842851", "step": 912, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:11:10.078308", "step": 912, "epoch": 1 }, { "type": "pplx", "content": 32940359.600703914, "timestamp": "2025-09-30 22:11:10.090732", "step": 912, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:10.153675", "step": 912, "epoch": 1 }, { "type": "loss", "content": 0.03332886844873428, "timestamp": "2025-09-30 22:11:10.157740", "step": 913, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:10.214129", "step": 913, "epoch": 1 }, { "type": "loss", "content": 0.013211602345108986, "timestamp": "2025-09-30 22:11:10.218989", "step": 914, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:10.272896", "step": 914, "epoch": 1 }, { "type": "loss", "content": 0.009566955268383026, "timestamp": "2025-09-30 22:11:10.284016", "step": 915, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:10.348896", "step": 915, "epoch": 1 }, { "type": "loss", "content": 0.008288740180432796, "timestamp": "2025-09-30 22:11:10.356530", "step": 916, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:10.411350", "step": 916, "epoch": 1 }, { "type": "loss", "content": 0.028552187606692314, "timestamp": "2025-09-30 22:11:10.421464", "step": 917, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:10.483632", "step": 917, "epoch": 2 }, { "type": "loss", "content": 0.04483124613761902, "timestamp": "2025-09-30 22:11:10.487570", "step": 918, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:10.541429", "step": 918, "epoch": 2 }, { "type": "loss", "content": 0.019391506910324097, "timestamp": "2025-09-30 22:11:10.557400", "step": 919, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:10.611653", "step": 919, "epoch": 2 }, { "type": "loss", "content": 0.054323118180036545, "timestamp": "2025-09-30 22:11:10.629999", "step": 920, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:10.691250", "step": 920, "epoch": 2 }, { "type": "loss", "content": 0.02535811997950077, "timestamp": "2025-09-30 22:11:10.698154", "step": 921, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:10.752424", "step": 921, "epoch": 2 }, { "type": "loss", "content": 0.04979207366704941, "timestamp": "2025-09-30 22:11:10.756092", "step": 922, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:10.811296", "step": 922, "epoch": 2 }, { "type": "loss", "content": 0.02441842295229435, "timestamp": "2025-09-30 22:11:10.814036", "step": 923, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:10.867564", "step": 923, "epoch": 2 }, { "type": "loss", "content": 0.04180010035634041, "timestamp": "2025-09-30 22:11:10.874041", "step": 924, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:10.927346", "step": 924, "epoch": 2 }, { "type": "loss", "content": 0.01361654233187437, "timestamp": "2025-09-30 22:11:10.937917", "step": 925, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:10.993247", "step": 925, "epoch": 2 }, { "type": "loss", "content": 0.003717446932569146, "timestamp": "2025-09-30 22:11:10.997274", "step": 926, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:11.051890", "step": 926, "epoch": 2 }, { "type": "loss", "content": 0.03469868749380112, "timestamp": "2025-09-30 22:11:11.055014", "step": 927, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:11.109765", "step": 927, "epoch": 2 }, { "type": "loss", "content": 0.022886861115694046, "timestamp": "2025-09-30 22:11:11.116739", "step": 928, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:11.176655", "step": 928, "epoch": 2 }, { "type": "loss", "content": 0.005226781126111746, "timestamp": "2025-09-30 22:11:11.179549", "step": 929, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:11.233548", "step": 929, "epoch": 2 }, { "type": "loss", "content": 0.017794528976082802, "timestamp": "2025-09-30 22:11:11.237401", "step": 930, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:11.291906", "step": 930, "epoch": 2 }, { "type": "loss", "content": 0.034500379115343094, "timestamp": "2025-09-30 22:11:11.294980", "step": 931, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:11.349397", "step": 931, "epoch": 2 }, { "type": "loss", "content": 0.030988583341240883, "timestamp": "2025-09-30 22:11:11.355998", "step": 932, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:11.410976", "step": 932, "epoch": 2 }, { "type": "loss", "content": 0.017014091834425926, "timestamp": "2025-09-30 22:11:11.414367", "step": 933, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:11.468837", "step": 933, "epoch": 2 }, { "type": "loss", "content": 0.010941265150904655, "timestamp": "2025-09-30 22:11:11.479844", "step": 934, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:11.536379", "step": 934, "epoch": 2 }, { "type": "loss", "content": 0.01286663394421339, "timestamp": "2025-09-30 22:11:11.539828", "step": 935, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:11.595693", "step": 935, "epoch": 2 }, { "type": "loss", "content": 0.015257543884217739, "timestamp": "2025-09-30 22:11:11.602568", "step": 936, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:11.656340", "step": 936, "epoch": 2 }, { "type": "loss", "content": 0.021351372823119164, "timestamp": "2025-09-30 22:11:11.659197", "step": 937, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:11.727179", "step": 937, "epoch": 2 }, { "type": "loss", "content": 0.02581597864627838, "timestamp": "2025-09-30 22:11:11.731138", "step": 938, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:11.786742", "step": 938, "epoch": 2 }, { "type": "loss", "content": 0.026747452095150948, "timestamp": "2025-09-30 22:11:11.790349", "step": 939, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:11.845472", "step": 939, "epoch": 2 }, { "type": "loss", "content": 0.017408985644578934, "timestamp": "2025-09-30 22:11:11.851936", "step": 940, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:11.905233", "step": 940, "epoch": 2 }, { "type": "loss", "content": 0.023534327745437622, "timestamp": "2025-09-30 22:11:11.907889", "step": 941, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:11.963045", "step": 941, "epoch": 2 }, { "type": "loss", "content": 0.018282432109117508, "timestamp": "2025-09-30 22:11:11.968359", "step": 942, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:12.025145", "step": 942, "epoch": 2 }, { "type": "loss", "content": 0.016469817608594894, "timestamp": "2025-09-30 22:11:12.029379", "step": 943, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:12.085252", "step": 943, "epoch": 2 }, { "type": "loss", "content": 0.014783737249672413, "timestamp": "2025-09-30 22:11:12.099785", "step": 944, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:12.160940", "step": 944, "epoch": 2 }, { "type": "loss", "content": 0.024034133180975914, "timestamp": "2025-09-30 22:11:12.164325", "step": 945, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:12.217753", "step": 945, "epoch": 2 }, { "type": "loss", "content": 0.021934911608695984, "timestamp": "2025-09-30 22:11:12.220707", "step": 946, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:12.280206", "step": 946, "epoch": 2 }, { "type": "loss", "content": 0.020119894295930862, "timestamp": "2025-09-30 22:11:12.283544", "step": 947, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:12.337534", "step": 947, "epoch": 2 }, { "type": "loss", "content": 0.023923667147755623, "timestamp": "2025-09-30 22:11:12.343895", "step": 948, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:12.398463", "step": 948, "epoch": 2 }, { "type": "loss", "content": 0.022407446056604385, "timestamp": "2025-09-30 22:11:12.402721", "step": 949, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:12.456921", "step": 949, "epoch": 2 }, { "type": "loss", "content": 0.013444255106151104, "timestamp": "2025-09-30 22:11:12.467226", "step": 950, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:12.530025", "step": 950, "epoch": 2 }, { "type": "loss", "content": 0.027343014255166054, "timestamp": "2025-09-30 22:11:12.542380", "step": 951, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:12.595965", "step": 951, "epoch": 2 }, { "type": "loss", "content": 0.01585647463798523, "timestamp": "2025-09-30 22:11:12.612721", "step": 952, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:12.666604", "step": 952, "epoch": 2 }, { "type": "loss", "content": 0.021314824000000954, "timestamp": "2025-09-30 22:11:12.669789", "step": 953, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:12.723426", "step": 953, "epoch": 2 }, { "type": "loss", "content": 0.025206713005900383, "timestamp": "2025-09-30 22:11:12.738101", "step": 954, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:12.791302", "step": 954, "epoch": 2 }, { "type": "loss", "content": 0.019110916182398796, "timestamp": "2025-09-30 22:11:12.794999", "step": 955, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:12.850074", "step": 955, "epoch": 2 }, { "type": "loss", "content": 0.017940467223525047, "timestamp": "2025-09-30 22:11:12.857144", "step": 956, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:12.912948", "step": 956, "epoch": 2 }, { "type": "loss", "content": 0.01818234659731388, "timestamp": "2025-09-30 22:11:12.916307", "step": 957, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:12.971346", "step": 957, "epoch": 2 }, { "type": "loss", "content": 0.0146263986825943, "timestamp": "2025-09-30 22:11:12.975228", "step": 958, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:13.028830", "step": 958, "epoch": 2 }, { "type": "loss", "content": 0.01784416474401951, "timestamp": "2025-09-30 22:11:13.032715", "step": 959, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:13.087234", "step": 959, "epoch": 2 }, { "type": "loss", "content": 0.01846710965037346, "timestamp": "2025-09-30 22:11:13.100827", "step": 960, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:13.162630", "step": 960, "epoch": 2 }, { "type": "loss", "content": 0.006946962792426348, "timestamp": "2025-09-30 22:11:13.165285", "step": 961, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:13.223612", "step": 961, "epoch": 2 }, { "type": "loss", "content": 0.013810682110488415, "timestamp": "2025-09-30 22:11:13.227015", "step": 962, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:13.281240", "step": 962, "epoch": 2 }, { "type": "loss", "content": 0.011714011430740356, "timestamp": "2025-09-30 22:11:13.284618", "step": 963, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:13.338804", "step": 963, "epoch": 2 }, { "type": "loss", "content": 0.01336191687732935, "timestamp": "2025-09-30 22:11:13.344647", "step": 964, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:13.399856", "step": 964, "epoch": 2 }, { "type": "loss", "content": 0.0091180969029665, "timestamp": "2025-09-30 22:11:13.407761", "step": 965, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:13.462252", "step": 965, "epoch": 2 }, { "type": "loss", "content": 0.006548265926539898, "timestamp": "2025-09-30 22:11:13.472964", "step": 966, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:13.528676", "step": 966, "epoch": 2 }, { "type": "loss", "content": 0.02103179506957531, "timestamp": "2025-09-30 22:11:13.531427", "step": 967, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:13.585497", "step": 967, "epoch": 2 }, { "type": "loss", "content": 0.010586266405880451, "timestamp": "2025-09-30 22:11:13.598192", "step": 968, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:13.651417", "step": 968, "epoch": 2 }, { "type": "loss", "content": 0.03246624767780304, "timestamp": "2025-09-30 22:11:13.654241", "step": 969, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:11:14.864683", "step": 969, "epoch": 2 }, { "type": "pplx", "content": 28798242.8768259, "timestamp": "2025-09-30 22:11:14.874517", "step": 969, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:14.929301", "step": 969, "epoch": 2 }, { "type": "loss", "content": 0.02270360477268696, "timestamp": "2025-09-30 22:11:14.932975", "step": 970, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:14.989521", "step": 970, "epoch": 2 }, { "type": "loss", "content": 0.023568162694573402, "timestamp": "2025-09-30 22:11:14.993038", "step": 971, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:15.047908", "step": 971, "epoch": 2 }, { "type": "loss", "content": 0.014281951822340488, "timestamp": "2025-09-30 22:11:15.056516", "step": 972, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:15.115587", "step": 972, "epoch": 2 }, { "type": "loss", "content": 0.02344512939453125, "timestamp": "2025-09-30 22:11:15.118054", "step": 973, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:15.171272", "step": 973, "epoch": 2 }, { "type": "loss", "content": 0.015097950585186481, "timestamp": "2025-09-30 22:11:15.180781", "step": 974, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:15.246952", "step": 974, "epoch": 2 }, { "type": "loss", "content": 0.015720214694738388, "timestamp": "2025-09-30 22:11:15.250015", "step": 975, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:15.307234", "step": 975, "epoch": 2 }, { "type": "loss", "content": 0.017806116491556168, "timestamp": "2025-09-30 22:11:15.320373", "step": 976, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:15.379570", "step": 976, "epoch": 2 }, { "type": "loss", "content": 0.028563665226101875, "timestamp": "2025-09-30 22:11:15.383120", "step": 977, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:15.437607", "step": 977, "epoch": 2 }, { "type": "loss", "content": 0.01088719256222248, "timestamp": "2025-09-30 22:11:15.449621", "step": 978, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:15.509702", "step": 978, "epoch": 2 }, { "type": "loss", "content": 0.01909536123275757, "timestamp": "2025-09-30 22:11:15.523252", "step": 979, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:15.588137", "step": 979, "epoch": 2 }, { "type": "loss", "content": 0.0036833793856203556, "timestamp": "2025-09-30 22:11:15.594879", "step": 980, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:15.648809", "step": 980, "epoch": 2 }, { "type": "loss", "content": 0.0032408982515335083, "timestamp": "2025-09-30 22:11:15.651580", "step": 981, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:15.713924", "step": 981, "epoch": 2 }, { "type": "loss", "content": 0.0031015234999358654, "timestamp": "2025-09-30 22:11:15.727931", "step": 982, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:15.785197", "step": 982, "epoch": 2 }, { "type": "loss", "content": 0.03271304816007614, "timestamp": "2025-09-30 22:11:15.788925", "step": 983, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:15.844036", "step": 983, "epoch": 2 }, { "type": "loss", "content": 0.004379452671855688, "timestamp": "2025-09-30 22:11:15.851440", "step": 984, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:15.905298", "step": 984, "epoch": 2 }, { "type": "loss", "content": 0.027334626764059067, "timestamp": "2025-09-30 22:11:15.909656", "step": 985, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:15.969612", "step": 985, "epoch": 2 }, { "type": "loss", "content": 0.03557107597589493, "timestamp": "2025-09-30 22:11:15.973229", "step": 986, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:16.028026", "step": 986, "epoch": 2 }, { "type": "loss", "content": 0.02836051769554615, "timestamp": "2025-09-30 22:11:16.030794", "step": 987, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:16.086426", "step": 987, "epoch": 2 }, { "type": "loss", "content": 0.0021814878564327955, "timestamp": "2025-09-30 22:11:16.100933", "step": 988, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:16.162657", "step": 988, "epoch": 2 }, { "type": "loss", "content": 0.045109979808330536, "timestamp": "2025-09-30 22:11:16.165248", "step": 989, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:16.221746", "step": 989, "epoch": 2 }, { "type": "loss", "content": 0.008987918496131897, "timestamp": "2025-09-30 22:11:16.231912", "step": 990, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:16.293739", "step": 990, "epoch": 2 }, { "type": "loss", "content": 0.05426352098584175, "timestamp": "2025-09-30 22:11:16.297126", "step": 991, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:16.352232", "step": 991, "epoch": 2 }, { "type": "loss", "content": 0.023926403373479843, "timestamp": "2025-09-30 22:11:16.365565", "step": 992, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:16.430875", "step": 992, "epoch": 2 }, { "type": "loss", "content": 0.021941417828202248, "timestamp": "2025-09-30 22:11:16.439585", "step": 993, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:16.498898", "step": 993, "epoch": 2 }, { "type": "loss", "content": 0.022387471050024033, "timestamp": "2025-09-30 22:11:16.503554", "step": 994, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:16.559981", "step": 994, "epoch": 2 }, { "type": "loss", "content": 0.034199006855487823, "timestamp": "2025-09-30 22:11:16.563770", "step": 995, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:16.626143", "step": 995, "epoch": 2 }, { "type": "loss", "content": 0.03138425573706627, "timestamp": "2025-09-30 22:11:16.633281", "step": 996, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:16.699866", "step": 996, "epoch": 2 }, { "type": "loss", "content": 0.01781405135989189, "timestamp": "2025-09-30 22:11:16.711858", "step": 997, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:16.774422", "step": 997, "epoch": 2 }, { "type": "loss", "content": 0.01842852309346199, "timestamp": "2025-09-30 22:11:16.778088", "step": 998, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:16.837439", "step": 998, "epoch": 2 }, { "type": "loss", "content": 0.015847353264689445, "timestamp": "2025-09-30 22:11:16.840628", "step": 999, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:16.905813", "step": 999, "epoch": 2 }, { "type": "loss", "content": 0.023063872009515762, "timestamp": "2025-09-30 22:11:16.922447", "step": 1000, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1000", "timestamp": "2025-09-30 22:11:17.342366", "step": 1000, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:17.406168", "step": 1000, "epoch": 2 }, { "type": "loss", "content": 0.01709248684346676, "timestamp": "2025-09-30 22:11:17.410020", "step": 1001, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:17.475350", "step": 1001, "epoch": 2 }, { "type": "loss", "content": 0.020289698615670204, "timestamp": "2025-09-30 22:11:17.479027", "step": 1002, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:17.537016", "step": 1002, "epoch": 2 }, { "type": "loss", "content": 0.008975411765277386, "timestamp": "2025-09-30 22:11:17.540174", "step": 1003, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:17.612218", "step": 1003, "epoch": 2 }, { "type": "loss", "content": 0.008156144060194492, "timestamp": "2025-09-30 22:11:17.625039", "step": 1004, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:17.681030", "step": 1004, "epoch": 2 }, { "type": "loss", "content": 0.01333966851234436, "timestamp": "2025-09-30 22:11:17.684917", "step": 1005, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:17.739488", "step": 1005, "epoch": 2 }, { "type": "loss", "content": 0.017511749640107155, "timestamp": "2025-09-30 22:11:17.749395", "step": 1006, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:17.812097", "step": 1006, "epoch": 2 }, { "type": "loss", "content": 0.015036171302199364, "timestamp": "2025-09-30 22:11:17.814729", "step": 1007, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:17.870282", "step": 1007, "epoch": 2 }, { "type": "loss", "content": 0.027377020567655563, "timestamp": "2025-09-30 22:11:17.884203", "step": 1008, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:17.939068", "step": 1008, "epoch": 2 }, { "type": "loss", "content": 0.025792699307203293, "timestamp": "2025-09-30 22:11:17.941916", "step": 1009, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:17.997774", "step": 1009, "epoch": 2 }, { "type": "loss", "content": 0.014380039647221565, "timestamp": "2025-09-30 22:11:18.000089", "step": 1010, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:18.061728", "step": 1010, "epoch": 2 }, { "type": "loss", "content": 0.010104767978191376, "timestamp": "2025-09-30 22:11:18.065231", "step": 1011, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:18.119492", "step": 1011, "epoch": 2 }, { "type": "loss", "content": 0.026348626241087914, "timestamp": "2025-09-30 22:11:18.136369", "step": 1012, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:18.198516", "step": 1012, "epoch": 2 }, { "type": "loss", "content": 0.04897860437631607, "timestamp": "2025-09-30 22:11:18.202468", "step": 1013, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:18.256848", "step": 1013, "epoch": 2 }, { "type": "loss", "content": 0.022462058812379837, "timestamp": "2025-09-30 22:11:18.260532", "step": 1014, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:18.315123", "step": 1014, "epoch": 2 }, { "type": "loss", "content": 0.01851494610309601, "timestamp": "2025-09-30 22:11:18.318588", "step": 1015, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:18.372184", "step": 1015, "epoch": 2 }, { "type": "loss", "content": 0.01477868389338255, "timestamp": "2025-09-30 22:11:18.380950", "step": 1016, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:18.435393", "step": 1016, "epoch": 2 }, { "type": "loss", "content": 0.024721411988139153, "timestamp": "2025-09-30 22:11:18.439119", "step": 1017, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:18.494363", "step": 1017, "epoch": 2 }, { "type": "loss", "content": 0.01894466020166874, "timestamp": "2025-09-30 22:11:18.497219", "step": 1018, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:18.558038", "step": 1018, "epoch": 2 }, { "type": "loss", "content": 0.02108973078429699, "timestamp": "2025-09-30 22:11:18.562002", "step": 1019, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:18.626079", "step": 1019, "epoch": 2 }, { "type": "loss", "content": 0.014491016045212746, "timestamp": "2025-09-30 22:11:18.639370", "step": 1020, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:18.695578", "step": 1020, "epoch": 2 }, { "type": "loss", "content": 0.01586849056184292, "timestamp": "2025-09-30 22:11:18.704763", "step": 1021, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:18.758234", "step": 1021, "epoch": 2 }, { "type": "loss", "content": 0.024214720353484154, "timestamp": "2025-09-30 22:11:18.761605", "step": 1022, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:18.815061", "step": 1022, "epoch": 2 }, { "type": "loss", "content": 0.01589794084429741, "timestamp": "2025-09-30 22:11:18.820225", "step": 1023, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:18.875416", "step": 1023, "epoch": 2 }, { "type": "loss", "content": 0.013331546448171139, "timestamp": "2025-09-30 22:11:18.882243", "step": 1024, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:18.936663", "step": 1024, "epoch": 2 }, { "type": "loss", "content": 0.024225672706961632, "timestamp": "2025-09-30 22:11:18.940125", "step": 1025, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:18.996413", "step": 1025, "epoch": 2 }, { "type": "loss", "content": 0.025553515180945396, "timestamp": "2025-09-30 22:11:19.013028", "step": 1026, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:11:20.209695", "step": 1026, "epoch": 2 }, { "type": "pplx", "content": 28815333.87535932, "timestamp": "2025-09-30 22:11:20.215325", "step": 1026, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:11:20.269869", "step": 1026, "epoch": 2 }, { "type": "loss", "content": 0.03137680143117905, "timestamp": "2025-09-30 22:11:20.275244", "step": 1027, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:20.330426", "step": 1027, "epoch": 2 }, { "type": "loss", "content": 0.012752371840178967, "timestamp": "2025-09-30 22:11:20.338824", "step": 1028, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:20.392801", "step": 1028, "epoch": 2 }, { "type": "loss", "content": 0.01019949372857809, "timestamp": "2025-09-30 22:11:20.395750", "step": 1029, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:20.449777", "step": 1029, "epoch": 2 }, { "type": "loss", "content": 0.00877032708376646, "timestamp": "2025-09-30 22:11:20.454485", "step": 1030, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:20.510207", "step": 1030, "epoch": 2 }, { "type": "loss", "content": 0.011759008280932903, "timestamp": "2025-09-30 22:11:20.513472", "step": 1031, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:20.567688", "step": 1031, "epoch": 2 }, { "type": "loss", "content": 0.011547918431460857, "timestamp": "2025-09-30 22:11:20.574407", "step": 1032, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:20.629320", "step": 1032, "epoch": 2 }, { "type": "loss", "content": 0.010614079423248768, "timestamp": "2025-09-30 22:11:20.631833", "step": 1033, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:20.693831", "step": 1033, "epoch": 2 }, { "type": "loss", "content": 0.021462971344590187, "timestamp": "2025-09-30 22:11:20.698081", "step": 1034, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:20.752860", "step": 1034, "epoch": 2 }, { "type": "loss", "content": 0.012604753486812115, "timestamp": "2025-09-30 22:11:20.755848", "step": 1035, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:20.811454", "step": 1035, "epoch": 2 }, { "type": "loss", "content": 0.0071957902982831, "timestamp": "2025-09-30 22:11:20.818646", "step": 1036, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:20.871794", "step": 1036, "epoch": 2 }, { "type": "loss", "content": 0.007674416061490774, "timestamp": "2025-09-30 22:11:20.875567", "step": 1037, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:20.930788", "step": 1037, "epoch": 2 }, { "type": "loss", "content": 0.030532391741871834, "timestamp": "2025-09-30 22:11:20.935486", "step": 1038, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:20.997287", "step": 1038, "epoch": 2 }, { "type": "loss", "content": 0.020820874720811844, "timestamp": "2025-09-30 22:11:21.000735", "step": 1039, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:21.056898", "step": 1039, "epoch": 2 }, { "type": "loss", "content": 0.02545594982802868, "timestamp": "2025-09-30 22:11:21.074256", "step": 1040, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:21.132305", "step": 1040, "epoch": 2 }, { "type": "loss", "content": 0.009167616255581379, "timestamp": "2025-09-30 22:11:21.135789", "step": 1041, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:21.193450", "step": 1041, "epoch": 2 }, { "type": "loss", "content": 0.009028018452227116, "timestamp": "2025-09-30 22:11:21.197632", "step": 1042, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:21.257441", "step": 1042, "epoch": 2 }, { "type": "loss", "content": 0.05561920627951622, "timestamp": "2025-09-30 22:11:21.261241", "step": 1043, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:21.317907", "step": 1043, "epoch": 2 }, { "type": "loss", "content": 0.010370013304054737, "timestamp": "2025-09-30 22:11:21.325239", "step": 1044, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:21.389422", "step": 1044, "epoch": 2 }, { "type": "loss", "content": 0.00789992231875658, "timestamp": "2025-09-30 22:11:21.396542", "step": 1045, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:21.458698", "step": 1045, "epoch": 2 }, { "type": "loss", "content": 0.015907449647784233, "timestamp": "2025-09-30 22:11:21.461609", "step": 1046, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:21.522637", "step": 1046, "epoch": 2 }, { "type": "loss", "content": 0.03674934431910515, "timestamp": "2025-09-30 22:11:21.525264", "step": 1047, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:21.583737", "step": 1047, "epoch": 2 }, { "type": "loss", "content": 0.01920946128666401, "timestamp": "2025-09-30 22:11:21.590229", "step": 1048, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:21.645190", "step": 1048, "epoch": 2 }, { "type": "loss", "content": 0.009556346572935581, "timestamp": "2025-09-30 22:11:21.653972", "step": 1049, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:21.718981", "step": 1049, "epoch": 2 }, { "type": "loss", "content": 0.007382354233413935, "timestamp": "2025-09-30 22:11:21.723397", "step": 1050, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:21.787348", "step": 1050, "epoch": 2 }, { "type": "loss", "content": 0.013103391043841839, "timestamp": "2025-09-30 22:11:21.796826", "step": 1051, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:21.859537", "step": 1051, "epoch": 2 }, { "type": "loss", "content": 0.012127166613936424, "timestamp": "2025-09-30 22:11:21.870499", "step": 1052, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:21.929892", "step": 1052, "epoch": 2 }, { "type": "loss", "content": 0.015366926789283752, "timestamp": "2025-09-30 22:11:21.938332", "step": 1053, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:21.999245", "step": 1053, "epoch": 2 }, { "type": "loss", "content": 0.014787280932068825, "timestamp": "2025-09-30 22:11:22.008025", "step": 1054, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:22.068794", "step": 1054, "epoch": 2 }, { "type": "loss", "content": 0.03266071528196335, "timestamp": "2025-09-30 22:11:22.072613", "step": 1055, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:22.132559", "step": 1055, "epoch": 2 }, { "type": "loss", "content": 0.02072693593800068, "timestamp": "2025-09-30 22:11:22.144141", "step": 1056, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:22.204106", "step": 1056, "epoch": 2 }, { "type": "loss", "content": 0.011774016544222832, "timestamp": "2025-09-30 22:11:22.213022", "step": 1057, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:22.273981", "step": 1057, "epoch": 2 }, { "type": "loss", "content": 0.016940701752901077, "timestamp": "2025-09-30 22:11:22.279224", "step": 1058, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:22.337089", "step": 1058, "epoch": 2 }, { "type": "loss", "content": 0.022047163918614388, "timestamp": "2025-09-30 22:11:22.347032", "step": 1059, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:22.420003", "step": 1059, "epoch": 2 }, { "type": "loss", "content": 0.011164650321006775, "timestamp": "2025-09-30 22:11:22.433830", "step": 1060, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:22.504073", "step": 1060, "epoch": 2 }, { "type": "loss", "content": 0.011306509375572205, "timestamp": "2025-09-30 22:11:22.506988", "step": 1061, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:22.569632", "step": 1061, "epoch": 2 }, { "type": "loss", "content": 0.012297945097088814, "timestamp": "2025-09-30 22:11:22.578635", "step": 1062, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:22.642090", "step": 1062, "epoch": 2 }, { "type": "loss", "content": 0.005849027074873447, "timestamp": "2025-09-30 22:11:22.645365", "step": 1063, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:22.702258", "step": 1063, "epoch": 2 }, { "type": "loss", "content": 0.009956971742212772, "timestamp": "2025-09-30 22:11:22.708813", "step": 1064, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:22.767276", "step": 1064, "epoch": 2 }, { "type": "loss", "content": 0.02906261757016182, "timestamp": "2025-09-30 22:11:22.770910", "step": 1065, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:22.831779", "step": 1065, "epoch": 2 }, { "type": "loss", "content": 0.034898433834314346, "timestamp": "2025-09-30 22:11:22.842828", "step": 1066, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:22.919431", "step": 1066, "epoch": 2 }, { "type": "loss", "content": 0.02677224576473236, "timestamp": "2025-09-30 22:11:22.922878", "step": 1067, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:22.981516", "step": 1067, "epoch": 2 }, { "type": "loss", "content": 0.011528550647199154, "timestamp": "2025-09-30 22:11:22.990009", "step": 1068, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:23.046267", "step": 1068, "epoch": 2 }, { "type": "loss", "content": 0.005817048251628876, "timestamp": "2025-09-30 22:11:23.054922", "step": 1069, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:23.110207", "step": 1069, "epoch": 2 }, { "type": "loss", "content": 0.014719395898282528, "timestamp": "2025-09-30 22:11:23.113003", "step": 1070, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:23.170356", "step": 1070, "epoch": 2 }, { "type": "loss", "content": 0.02249346859753132, "timestamp": "2025-09-30 22:11:23.173044", "step": 1071, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:23.226950", "step": 1071, "epoch": 2 }, { "type": "loss", "content": 0.035828422755002975, "timestamp": "2025-09-30 22:11:23.233136", "step": 1072, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:23.286002", "step": 1072, "epoch": 2 }, { "type": "loss", "content": 0.010695637203752995, "timestamp": "2025-09-30 22:11:23.288722", "step": 1073, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:23.345359", "step": 1073, "epoch": 2 }, { "type": "loss", "content": 0.0401848740875721, "timestamp": "2025-09-30 22:11:23.352471", "step": 1074, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:23.413270", "step": 1074, "epoch": 2 }, { "type": "loss", "content": 0.05746329203248024, "timestamp": "2025-09-30 22:11:23.416088", "step": 1075, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:23.479871", "step": 1075, "epoch": 2 }, { "type": "loss", "content": 0.04341733828186989, "timestamp": "2025-09-30 22:11:23.494593", "step": 1076, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:23.552516", "step": 1076, "epoch": 2 }, { "type": "loss", "content": 0.02323761023581028, "timestamp": "2025-09-30 22:11:23.555673", "step": 1077, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:23.610535", "step": 1077, "epoch": 2 }, { "type": "loss", "content": 0.02646273747086525, "timestamp": "2025-09-30 22:11:23.617313", "step": 1078, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:23.679251", "step": 1078, "epoch": 2 }, { "type": "loss", "content": 0.017116302624344826, "timestamp": "2025-09-30 22:11:23.684229", "step": 1079, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:23.739927", "step": 1079, "epoch": 2 }, { "type": "loss", "content": 0.018749186769127846, "timestamp": "2025-09-30 22:11:23.757600", "step": 1080, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:23.813780", "step": 1080, "epoch": 2 }, { "type": "loss", "content": 0.025141606107354164, "timestamp": "2025-09-30 22:11:23.828231", "step": 1081, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:23.890960", "step": 1081, "epoch": 2 }, { "type": "loss", "content": 0.026854228228330612, "timestamp": "2025-09-30 22:11:23.895100", "step": 1082, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:23.953254", "step": 1082, "epoch": 2 }, { "type": "loss", "content": 0.019390176981687546, "timestamp": "2025-09-30 22:11:23.956489", "step": 1083, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:11:25.186778", "step": 1083, "epoch": 2 }, { "type": "pplx", "content": 30432312.81053753, "timestamp": "2025-09-30 22:11:25.198564", "step": 1083, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:25.261335", "step": 1083, "epoch": 2 }, { "type": "loss", "content": 0.01611759141087532, "timestamp": "2025-09-30 22:11:25.269310", "step": 1084, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:25.325196", "step": 1084, "epoch": 2 }, { "type": "loss", "content": 0.006585521157830954, "timestamp": "2025-09-30 22:11:25.329013", "step": 1085, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:25.389748", "step": 1085, "epoch": 2 }, { "type": "loss", "content": 0.01776345819234848, "timestamp": "2025-09-30 22:11:25.393830", "step": 1086, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:25.449930", "step": 1086, "epoch": 2 }, { "type": "loss", "content": 0.02161712571978569, "timestamp": "2025-09-30 22:11:25.462318", "step": 1087, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:25.525802", "step": 1087, "epoch": 2 }, { "type": "loss", "content": 0.008740507997572422, "timestamp": "2025-09-30 22:11:25.533504", "step": 1088, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:25.591122", "step": 1088, "epoch": 2 }, { "type": "loss", "content": 0.03131970018148422, "timestamp": "2025-09-30 22:11:25.595109", "step": 1089, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:25.649810", "step": 1089, "epoch": 2 }, { "type": "loss", "content": 0.021836699917912483, "timestamp": "2025-09-30 22:11:25.652709", "step": 1090, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:25.715906", "step": 1090, "epoch": 2 }, { "type": "loss", "content": 0.02508729137480259, "timestamp": "2025-09-30 22:11:25.727891", "step": 1091, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:25.791875", "step": 1091, "epoch": 2 }, { "type": "loss", "content": 0.013201175257563591, "timestamp": "2025-09-30 22:11:25.799459", "step": 1092, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:25.854161", "step": 1092, "epoch": 2 }, { "type": "loss", "content": 0.025001246482133865, "timestamp": "2025-09-30 22:11:25.860494", "step": 1093, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:25.918284", "step": 1093, "epoch": 2 }, { "type": "loss", "content": 0.018011152744293213, "timestamp": "2025-09-30 22:11:25.922173", "step": 1094, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:25.977889", "step": 1094, "epoch": 2 }, { "type": "loss", "content": 0.015894226729869843, "timestamp": "2025-09-30 22:11:25.980986", "step": 1095, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:26.037531", "step": 1095, "epoch": 2 }, { "type": "loss", "content": 0.008476397953927517, "timestamp": "2025-09-30 22:11:26.044321", "step": 1096, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:26.097394", "step": 1096, "epoch": 2 }, { "type": "loss", "content": 0.017200523987412453, "timestamp": "2025-09-30 22:11:26.100930", "step": 1097, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:26.158557", "step": 1097, "epoch": 2 }, { "type": "loss", "content": 0.020911503583192825, "timestamp": "2025-09-30 22:11:26.161645", "step": 1098, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:26.216231", "step": 1098, "epoch": 2 }, { "type": "loss", "content": 0.0157220046967268, "timestamp": "2025-09-30 22:11:26.228058", "step": 1099, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:26.283086", "step": 1099, "epoch": 2 }, { "type": "loss", "content": 0.020792776718735695, "timestamp": "2025-09-30 22:11:26.291051", "step": 1100, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:26.345330", "step": 1100, "epoch": 2 }, { "type": "loss", "content": 0.026384010910987854, "timestamp": "2025-09-30 22:11:26.349217", "step": 1101, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:26.404260", "step": 1101, "epoch": 2 }, { "type": "loss", "content": 0.00842324923723936, "timestamp": "2025-09-30 22:11:26.408124", "step": 1102, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:26.468374", "step": 1102, "epoch": 2 }, { "type": "loss", "content": 0.018321281298995018, "timestamp": "2025-09-30 22:11:26.480784", "step": 1103, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:26.545071", "step": 1103, "epoch": 2 }, { "type": "loss", "content": 0.016511019319295883, "timestamp": "2025-09-30 22:11:26.556678", "step": 1104, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:26.610703", "step": 1104, "epoch": 2 }, { "type": "loss", "content": 0.010460903868079185, "timestamp": "2025-09-30 22:11:26.622778", "step": 1105, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:26.687316", "step": 1105, "epoch": 2 }, { "type": "loss", "content": 0.00898995902389288, "timestamp": "2025-09-30 22:11:26.690735", "step": 1106, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:26.759562", "step": 1106, "epoch": 2 }, { "type": "loss", "content": 0.027517573907971382, "timestamp": "2025-09-30 22:11:26.764616", "step": 1107, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:26.821596", "step": 1107, "epoch": 2 }, { "type": "loss", "content": 0.021612342447042465, "timestamp": "2025-09-30 22:11:26.839263", "step": 1108, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:26.895507", "step": 1108, "epoch": 2 }, { "type": "loss", "content": 0.009821252897381783, "timestamp": "2025-09-30 22:11:26.899450", "step": 1109, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:26.955573", "step": 1109, "epoch": 2 }, { "type": "loss", "content": 0.018084069713950157, "timestamp": "2025-09-30 22:11:26.958415", "step": 1110, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:27.012521", "step": 1110, "epoch": 2 }, { "type": "loss", "content": 0.014403236098587513, "timestamp": "2025-09-30 22:11:27.016372", "step": 1111, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:27.070513", "step": 1111, "epoch": 2 }, { "type": "loss", "content": 0.02946326695382595, "timestamp": "2025-09-30 22:11:27.085522", "step": 1112, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:27.139790", "step": 1112, "epoch": 2 }, { "type": "loss", "content": 0.01968367025256157, "timestamp": "2025-09-30 22:11:27.143655", "step": 1113, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:27.200252", "step": 1113, "epoch": 2 }, { "type": "loss", "content": 0.00848131999373436, "timestamp": "2025-09-30 22:11:27.204535", "step": 1114, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:27.268243", "step": 1114, "epoch": 2 }, { "type": "loss", "content": 0.021036352962255478, "timestamp": "2025-09-30 22:11:27.271299", "step": 1115, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:27.325982", "step": 1115, "epoch": 2 }, { "type": "loss", "content": 0.023403389379382133, "timestamp": "2025-09-30 22:11:27.332962", "step": 1116, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:27.396211", "step": 1116, "epoch": 2 }, { "type": "loss", "content": 0.023917311802506447, "timestamp": "2025-09-30 22:11:27.401426", "step": 1117, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:27.455716", "step": 1117, "epoch": 2 }, { "type": "loss", "content": 0.013939259573817253, "timestamp": "2025-09-30 22:11:27.458932", "step": 1118, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:27.515666", "step": 1118, "epoch": 2 }, { "type": "loss", "content": 0.0158989354968071, "timestamp": "2025-09-30 22:11:27.519650", "step": 1119, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:27.575322", "step": 1119, "epoch": 2 }, { "type": "loss", "content": 0.02595258131623268, "timestamp": "2025-09-30 22:11:27.583748", "step": 1120, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:27.640282", "step": 1120, "epoch": 2 }, { "type": "loss", "content": 0.021582497283816338, "timestamp": "2025-09-30 22:11:27.644484", "step": 1121, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:27.699420", "step": 1121, "epoch": 2 }, { "type": "loss", "content": 0.005752278957515955, "timestamp": "2025-09-30 22:11:27.704823", "step": 1122, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:27.765007", "step": 1122, "epoch": 2 }, { "type": "loss", "content": 0.010556980967521667, "timestamp": "2025-09-30 22:11:27.767848", "step": 1123, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:27.823206", "step": 1123, "epoch": 2 }, { "type": "loss", "content": 0.01841617561876774, "timestamp": "2025-09-30 22:11:27.835200", "step": 1124, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:27.898988", "step": 1124, "epoch": 2 }, { "type": "loss", "content": 0.00747791538015008, "timestamp": "2025-09-30 22:11:27.903636", "step": 1125, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:27.958240", "step": 1125, "epoch": 2 }, { "type": "loss", "content": 0.023193703964352608, "timestamp": "2025-09-30 22:11:27.969994", "step": 1126, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:28.033276", "step": 1126, "epoch": 2 }, { "type": "loss", "content": 0.03435632213950157, "timestamp": "2025-09-30 22:11:28.037037", "step": 1127, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:28.092858", "step": 1127, "epoch": 2 }, { "type": "loss", "content": 0.006761268712580204, "timestamp": "2025-09-30 22:11:28.109904", "step": 1128, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:28.175862", "step": 1128, "epoch": 2 }, { "type": "loss", "content": 0.055748604238033295, "timestamp": "2025-09-30 22:11:28.189431", "step": 1129, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:28.252544", "step": 1129, "epoch": 2 }, { "type": "loss", "content": 0.035190846771001816, "timestamp": "2025-09-30 22:11:28.257262", "step": 1130, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:28.322185", "step": 1130, "epoch": 2 }, { "type": "loss", "content": 0.01312336977571249, "timestamp": "2025-09-30 22:11:28.334537", "step": 1131, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:28.400136", "step": 1131, "epoch": 2 }, { "type": "loss", "content": 0.01045858021825552, "timestamp": "2025-09-30 22:11:28.409011", "step": 1132, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:28.471113", "step": 1132, "epoch": 2 }, { "type": "loss", "content": 0.028751153498888016, "timestamp": "2025-09-30 22:11:28.483658", "step": 1133, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:28.550451", "step": 1133, "epoch": 2 }, { "type": "loss", "content": 0.01273017842322588, "timestamp": "2025-09-30 22:11:28.563256", "step": 1134, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:28.619462", "step": 1134, "epoch": 2 }, { "type": "loss", "content": 0.0197049081325531, "timestamp": "2025-09-30 22:11:28.623295", "step": 1135, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:28.688139", "step": 1135, "epoch": 2 }, { "type": "loss", "content": 0.007282760459929705, "timestamp": "2025-09-30 22:11:28.695520", "step": 1136, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:28.766569", "step": 1136, "epoch": 2 }, { "type": "loss", "content": 0.007011772133409977, "timestamp": "2025-09-30 22:11:28.778955", "step": 1137, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:28.834229", "step": 1137, "epoch": 2 }, { "type": "loss", "content": 0.01817786693572998, "timestamp": "2025-09-30 22:11:28.838713", "step": 1138, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:28.894352", "step": 1138, "epoch": 2 }, { "type": "loss", "content": 0.009522279724478722, "timestamp": "2025-09-30 22:11:28.904447", "step": 1139, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:28.958831", "step": 1139, "epoch": 2 }, { "type": "loss", "content": 0.015000018291175365, "timestamp": "2025-09-30 22:11:28.976470", "step": 1140, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:11:30.202755", "step": 1140, "epoch": 2 }, { "type": "pplx", "content": 31761433.22399976, "timestamp": "2025-09-30 22:11:30.207738", "step": 1140, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:30.261288", "step": 1140, "epoch": 2 }, { "type": "loss", "content": 0.023212244734168053, "timestamp": "2025-09-30 22:11:30.264791", "step": 1141, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:30.320013", "step": 1141, "epoch": 2 }, { "type": "loss", "content": 0.005753274541348219, "timestamp": "2025-09-30 22:11:30.324165", "step": 1142, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:30.383960", "step": 1142, "epoch": 2 }, { "type": "loss", "content": 0.016708120703697205, "timestamp": "2025-09-30 22:11:30.388811", "step": 1143, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:30.453550", "step": 1143, "epoch": 2 }, { "type": "loss", "content": 0.03149080649018288, "timestamp": "2025-09-30 22:11:30.460546", "step": 1144, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:30.515308", "step": 1144, "epoch": 2 }, { "type": "loss", "content": 0.0065864152275025845, "timestamp": "2025-09-30 22:11:30.519706", "step": 1145, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:30.575127", "step": 1145, "epoch": 2 }, { "type": "loss", "content": 0.015503957867622375, "timestamp": "2025-09-30 22:11:30.578397", "step": 1146, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:30.633640", "step": 1146, "epoch": 2 }, { "type": "loss", "content": 0.011548278853297234, "timestamp": "2025-09-30 22:11:30.639347", "step": 1147, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:30.695135", "step": 1147, "epoch": 2 }, { "type": "loss", "content": 0.014130154624581337, "timestamp": "2025-09-30 22:11:30.710275", "step": 1148, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:30.764519", "step": 1148, "epoch": 2 }, { "type": "loss", "content": 0.022189956158399582, "timestamp": "2025-09-30 22:11:30.769789", "step": 1149, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:30.824056", "step": 1149, "epoch": 2 }, { "type": "loss", "content": 0.01809052750468254, "timestamp": "2025-09-30 22:11:30.828934", "step": 1150, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:30.884363", "step": 1150, "epoch": 2 }, { "type": "loss", "content": 0.0377984382212162, "timestamp": "2025-09-30 22:11:30.888947", "step": 1151, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:30.943518", "step": 1151, "epoch": 2 }, { "type": "loss", "content": 0.014366830699145794, "timestamp": "2025-09-30 22:11:30.953335", "step": 1152, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:31.007608", "step": 1152, "epoch": 2 }, { "type": "loss", "content": 0.019895801320672035, "timestamp": "2025-09-30 22:11:31.017945", "step": 1153, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:31.079661", "step": 1153, "epoch": 2 }, { "type": "loss", "content": 0.01277296245098114, "timestamp": "2025-09-30 22:11:31.086770", "step": 1154, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:31.140713", "step": 1154, "epoch": 2 }, { "type": "loss", "content": 0.027852704748511314, "timestamp": "2025-09-30 22:11:31.146979", "step": 1155, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:31.200899", "step": 1155, "epoch": 2 }, { "type": "loss", "content": 0.01802685298025608, "timestamp": "2025-09-30 22:11:31.207518", "step": 1156, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:31.261180", "step": 1156, "epoch": 2 }, { "type": "loss", "content": 0.0185939259827137, "timestamp": "2025-09-30 22:11:31.264827", "step": 1157, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:31.321946", "step": 1157, "epoch": 2 }, { "type": "loss", "content": 0.00952514261007309, "timestamp": "2025-09-30 22:11:31.325302", "step": 1158, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:31.380426", "step": 1158, "epoch": 2 }, { "type": "loss", "content": 0.014004341326653957, "timestamp": "2025-09-30 22:11:31.383957", "step": 1159, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:31.437605", "step": 1159, "epoch": 2 }, { "type": "loss", "content": 0.011020747944712639, "timestamp": "2025-09-30 22:11:31.444549", "step": 1160, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:31.508814", "step": 1160, "epoch": 2 }, { "type": "loss", "content": 0.007963932119309902, "timestamp": "2025-09-30 22:11:31.512980", "step": 1161, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:31.568675", "step": 1161, "epoch": 2 }, { "type": "loss", "content": 0.029965534806251526, "timestamp": "2025-09-30 22:11:31.574911", "step": 1162, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:31.644846", "step": 1162, "epoch": 2 }, { "type": "loss", "content": 0.012620776891708374, "timestamp": "2025-09-30 22:11:31.658105", "step": 1163, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:31.715055", "step": 1163, "epoch": 2 }, { "type": "loss", "content": 0.017773713916540146, "timestamp": "2025-09-30 22:11:31.722580", "step": 1164, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:31.776212", "step": 1164, "epoch": 2 }, { "type": "loss", "content": 0.015509136021137238, "timestamp": "2025-09-30 22:11:31.780116", "step": 1165, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:31.834854", "step": 1165, "epoch": 2 }, { "type": "loss", "content": 0.01890859194099903, "timestamp": "2025-09-30 22:11:31.837981", "step": 1166, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:31.904513", "step": 1166, "epoch": 2 }, { "type": "loss", "content": 0.010066986083984375, "timestamp": "2025-09-30 22:11:31.911328", "step": 1167, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:31.965873", "step": 1167, "epoch": 2 }, { "type": "loss", "content": 0.03768039494752884, "timestamp": "2025-09-30 22:11:31.974127", "step": 1168, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:32.038694", "step": 1168, "epoch": 2 }, { "type": "loss", "content": 0.01412307471036911, "timestamp": "2025-09-30 22:11:32.042393", "step": 1169, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:32.097717", "step": 1169, "epoch": 2 }, { "type": "loss", "content": 0.005952424369752407, "timestamp": "2025-09-30 22:11:32.105721", "step": 1170, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:32.162544", "step": 1170, "epoch": 2 }, { "type": "loss", "content": 0.02057734504342079, "timestamp": "2025-09-30 22:11:32.166457", "step": 1171, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:32.224240", "step": 1171, "epoch": 2 }, { "type": "loss", "content": 0.008706753142178059, "timestamp": "2025-09-30 22:11:32.231341", "step": 1172, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:11:32.285741", "step": 1172, "epoch": 2 }, { "type": "loss", "content": 0.011364555917680264, "timestamp": "2025-09-30 22:11:32.289356", "step": 1173, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:32.355027", "step": 1173, "epoch": 2 }, { "type": "loss", "content": 0.008187648840248585, "timestamp": "2025-09-30 22:11:32.369898", "step": 1174, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:32.433314", "step": 1174, "epoch": 2 }, { "type": "loss", "content": 0.01807296834886074, "timestamp": "2025-09-30 22:11:32.445908", "step": 1175, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:32.500680", "step": 1175, "epoch": 2 }, { "type": "loss", "content": 0.009041151963174343, "timestamp": "2025-09-30 22:11:32.517829", "step": 1176, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:32.575103", "step": 1176, "epoch": 2 }, { "type": "loss", "content": 0.022169558331370354, "timestamp": "2025-09-30 22:11:32.587014", "step": 1177, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:32.661750", "step": 1177, "epoch": 2 }, { "type": "loss", "content": 0.009721535257995129, "timestamp": "2025-09-30 22:11:32.674137", "step": 1178, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:32.737930", "step": 1178, "epoch": 2 }, { "type": "loss", "content": 0.029778840020298958, "timestamp": "2025-09-30 22:11:32.741818", "step": 1179, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:32.799497", "step": 1179, "epoch": 2 }, { "type": "loss", "content": 0.012037808075547218, "timestamp": "2025-09-30 22:11:32.808244", "step": 1180, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:32.863512", "step": 1180, "epoch": 2 }, { "type": "loss", "content": 0.028365235775709152, "timestamp": "2025-09-30 22:11:32.875714", "step": 1181, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:32.944132", "step": 1181, "epoch": 2 }, { "type": "loss", "content": 0.03483065962791443, "timestamp": "2025-09-30 22:11:32.947386", "step": 1182, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:33.001792", "step": 1182, "epoch": 2 }, { "type": "loss", "content": 0.018574940040707588, "timestamp": "2025-09-30 22:11:33.005179", "step": 1183, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:33.062620", "step": 1183, "epoch": 2 }, { "type": "loss", "content": 0.005325800273567438, "timestamp": "2025-09-30 22:11:33.069033", "step": 1184, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:33.130798", "step": 1184, "epoch": 2 }, { "type": "loss", "content": 0.020132439211010933, "timestamp": "2025-09-30 22:11:33.141806", "step": 1185, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:33.198157", "step": 1185, "epoch": 2 }, { "type": "loss", "content": 0.01631121151149273, "timestamp": "2025-09-30 22:11:33.210341", "step": 1186, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:33.273179", "step": 1186, "epoch": 2 }, { "type": "loss", "content": 0.004564606584608555, "timestamp": "2025-09-30 22:11:33.278063", "step": 1187, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:33.332605", "step": 1187, "epoch": 2 }, { "type": "loss", "content": 0.014193967916071415, "timestamp": "2025-09-30 22:11:33.347094", "step": 1188, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:33.409336", "step": 1188, "epoch": 2 }, { "type": "loss", "content": 0.0075381542555987835, "timestamp": "2025-09-30 22:11:33.412982", "step": 1189, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:33.466666", "step": 1189, "epoch": 2 }, { "type": "loss", "content": 0.042488861829042435, "timestamp": "2025-09-30 22:11:33.471047", "step": 1190, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:33.534611", "step": 1190, "epoch": 2 }, { "type": "loss", "content": 0.007255176547914743, "timestamp": "2025-09-30 22:11:33.545876", "step": 1191, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:33.602630", "step": 1191, "epoch": 2 }, { "type": "loss", "content": 0.02486496791243553, "timestamp": "2025-09-30 22:11:33.611199", "step": 1192, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:33.673991", "step": 1192, "epoch": 2 }, { "type": "loss", "content": 0.018168171867728233, "timestamp": "2025-09-30 22:11:33.679538", "step": 1193, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:33.735190", "step": 1193, "epoch": 2 }, { "type": "loss", "content": 0.0016164934495463967, "timestamp": "2025-09-30 22:11:33.740699", "step": 1194, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:33.799993", "step": 1194, "epoch": 2 }, { "type": "loss", "content": 0.0472748838365078, "timestamp": "2025-09-30 22:11:33.813257", "step": 1195, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:33.875956", "step": 1195, "epoch": 2 }, { "type": "loss", "content": 0.018774310126900673, "timestamp": "2025-09-30 22:11:33.883590", "step": 1196, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:33.938359", "step": 1196, "epoch": 2 }, { "type": "loss", "content": 0.054423097521066666, "timestamp": "2025-09-30 22:11:33.949340", "step": 1197, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:11:35.187868", "step": 1197, "epoch": 2 }, { "type": "pplx", "content": 32067022.826058734, "timestamp": "2025-09-30 22:11:35.192378", "step": 1197, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:35.253335", "step": 1197, "epoch": 2 }, { "type": "loss", "content": 0.007513918448239565, "timestamp": "2025-09-30 22:11:35.256670", "step": 1198, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:35.314883", "step": 1198, "epoch": 2 }, { "type": "loss", "content": 0.014755907468497753, "timestamp": "2025-09-30 22:11:35.318113", "step": 1199, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:35.378580", "step": 1199, "epoch": 2 }, { "type": "loss", "content": 0.021854715421795845, "timestamp": "2025-09-30 22:11:35.390404", "step": 1200, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:35.449304", "step": 1200, "epoch": 2 }, { "type": "loss", "content": 0.041134320199489594, "timestamp": "2025-09-30 22:11:35.452074", "step": 1201, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:35.516034", "step": 1201, "epoch": 2 }, { "type": "loss", "content": 0.005287197418510914, "timestamp": "2025-09-30 22:11:35.520991", "step": 1202, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:35.577432", "step": 1202, "epoch": 2 }, { "type": "loss", "content": 0.0038279914297163486, "timestamp": "2025-09-30 22:11:35.581096", "step": 1203, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:11:35.637823", "step": 1203, "epoch": 2 }, { "type": "loss", "content": 0.003830043599009514, "timestamp": "2025-09-30 22:11:35.646802", "step": 1204, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:35.700118", "step": 1204, "epoch": 2 }, { "type": "loss", "content": 0.032837968319654465, "timestamp": "2025-09-30 22:11:35.704811", "step": 1205, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:35.760611", "step": 1205, "epoch": 2 }, { "type": "loss", "content": 0.03423704952001572, "timestamp": "2025-09-30 22:11:35.763826", "step": 1206, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:35.823186", "step": 1206, "epoch": 2 }, { "type": "loss", "content": 0.008218946866691113, "timestamp": "2025-09-30 22:11:35.826450", "step": 1207, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:35.880558", "step": 1207, "epoch": 2 }, { "type": "loss", "content": 0.01721678301692009, "timestamp": "2025-09-30 22:11:35.887864", "step": 1208, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:35.941792", "step": 1208, "epoch": 2 }, { "type": "loss", "content": 0.019656702876091003, "timestamp": "2025-09-30 22:11:35.944512", "step": 1209, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:35.998526", "step": 1209, "epoch": 2 }, { "type": "loss", "content": 0.029059452936053276, "timestamp": "2025-09-30 22:11:36.003965", "step": 1210, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:36.058525", "step": 1210, "epoch": 2 }, { "type": "loss", "content": 0.030055100098252296, "timestamp": "2025-09-30 22:11:36.061611", "step": 1211, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:36.117192", "step": 1211, "epoch": 2 }, { "type": "loss", "content": 0.02078065276145935, "timestamp": "2025-09-30 22:11:36.130008", "step": 1212, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:36.184580", "step": 1212, "epoch": 2 }, { "type": "loss", "content": 0.010412467643618584, "timestamp": "2025-09-30 22:11:36.193851", "step": 1213, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:36.250863", "step": 1213, "epoch": 2 }, { "type": "loss", "content": 0.014479429461061954, "timestamp": "2025-09-30 22:11:36.254945", "step": 1214, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:36.312801", "step": 1214, "epoch": 2 }, { "type": "loss", "content": 0.016420889645814896, "timestamp": "2025-09-30 22:11:36.315643", "step": 1215, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:36.377191", "step": 1215, "epoch": 2 }, { "type": "loss", "content": 0.02330826409161091, "timestamp": "2025-09-30 22:11:36.383470", "step": 1216, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:36.437724", "step": 1216, "epoch": 2 }, { "type": "loss", "content": 0.018506629392504692, "timestamp": "2025-09-30 22:11:36.450976", "step": 1217, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:36.505571", "step": 1217, "epoch": 2 }, { "type": "loss", "content": 0.006810951977968216, "timestamp": "2025-09-30 22:11:36.509502", "step": 1218, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:36.567290", "step": 1218, "epoch": 2 }, { "type": "loss", "content": 0.014696040190756321, "timestamp": "2025-09-30 22:11:36.574004", "step": 1219, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:36.631116", "step": 1219, "epoch": 2 }, { "type": "loss", "content": 0.012312375009059906, "timestamp": "2025-09-30 22:11:36.637927", "step": 1220, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:36.694225", "step": 1220, "epoch": 2 }, { "type": "loss", "content": 0.007672054227441549, "timestamp": "2025-09-30 22:11:36.705527", "step": 1221, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:36.763351", "step": 1221, "epoch": 2 }, { "type": "loss", "content": 0.008700719103217125, "timestamp": "2025-09-30 22:11:36.768175", "step": 1222, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:36.821917", "step": 1222, "epoch": 2 }, { "type": "loss", "content": 0.009297819808125496, "timestamp": "2025-09-30 22:11:36.824822", "step": 1223, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:36.879397", "step": 1223, "epoch": 2 }, { "type": "loss", "content": 0.01288828905671835, "timestamp": "2025-09-30 22:11:36.890240", "step": 1224, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:36.955515", "step": 1224, "epoch": 2 }, { "type": "loss", "content": 0.018189510330557823, "timestamp": "2025-09-30 22:11:36.958618", "step": 1225, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:37.012393", "step": 1225, "epoch": 2 }, { "type": "loss", "content": 0.003722636727616191, "timestamp": "2025-09-30 22:11:37.025019", "step": 1226, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:37.080604", "step": 1226, "epoch": 2 }, { "type": "loss", "content": 0.014420069754123688, "timestamp": "2025-09-30 22:11:37.087616", "step": 1227, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:37.142487", "step": 1227, "epoch": 2 }, { "type": "loss", "content": 0.017777537927031517, "timestamp": "2025-09-30 22:11:37.148574", "step": 1228, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:37.202651", "step": 1228, "epoch": 2 }, { "type": "loss", "content": 0.005553639028221369, "timestamp": "2025-09-30 22:11:37.205175", "step": 1229, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:37.269419", "step": 1229, "epoch": 2 }, { "type": "loss", "content": 0.02909952774643898, "timestamp": "2025-09-30 22:11:37.272973", "step": 1230, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:37.327626", "step": 1230, "epoch": 2 }, { "type": "loss", "content": 0.008751442655920982, "timestamp": "2025-09-30 22:11:37.331343", "step": 1231, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:37.386264", "step": 1231, "epoch": 2 }, { "type": "loss", "content": 0.022108396515250206, "timestamp": "2025-09-30 22:11:37.393911", "step": 1232, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:37.447858", "step": 1232, "epoch": 2 }, { "type": "loss", "content": 0.0038794793654233217, "timestamp": "2025-09-30 22:11:37.454350", "step": 1233, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:37.517045", "step": 1233, "epoch": 2 }, { "type": "loss", "content": 0.009794176556169987, "timestamp": "2025-09-30 22:11:37.520053", "step": 1234, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:37.575368", "step": 1234, "epoch": 2 }, { "type": "loss", "content": 0.02226036600768566, "timestamp": "2025-09-30 22:11:37.578470", "step": 1235, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:37.638983", "step": 1235, "epoch": 2 }, { "type": "loss", "content": 0.011220982298254967, "timestamp": "2025-09-30 22:11:37.645391", "step": 1236, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:37.705527", "step": 1236, "epoch": 2 }, { "type": "loss", "content": 0.010935097001492977, "timestamp": "2025-09-30 22:11:37.717267", "step": 1237, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:37.771780", "step": 1237, "epoch": 2 }, { "type": "loss", "content": 0.015008168295025826, "timestamp": "2025-09-30 22:11:37.774709", "step": 1238, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:37.828421", "step": 1238, "epoch": 2 }, { "type": "loss", "content": 0.03053663857281208, "timestamp": "2025-09-30 22:11:37.831613", "step": 1239, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:37.887977", "step": 1239, "epoch": 2 }, { "type": "loss", "content": 0.02169874683022499, "timestamp": "2025-09-30 22:11:37.894966", "step": 1240, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:37.949789", "step": 1240, "epoch": 2 }, { "type": "loss", "content": 0.012753850780427456, "timestamp": "2025-09-30 22:11:37.952719", "step": 1241, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:38.006491", "step": 1241, "epoch": 2 }, { "type": "loss", "content": 0.013386455364525318, "timestamp": "2025-09-30 22:11:38.009169", "step": 1242, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:38.063334", "step": 1242, "epoch": 2 }, { "type": "loss", "content": 0.029180806130170822, "timestamp": "2025-09-30 22:11:38.067437", "step": 1243, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:38.122343", "step": 1243, "epoch": 2 }, { "type": "loss", "content": 0.02196519263088703, "timestamp": "2025-09-30 22:11:38.135050", "step": 1244, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:11:38.188177", "step": 1244, "epoch": 2 }, { "type": "loss", "content": 0.02870265766978264, "timestamp": "2025-09-30 22:11:38.190963", "step": 1245, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:38.245249", "step": 1245, "epoch": 2 }, { "type": "loss", "content": 0.023608211427927017, "timestamp": "2025-09-30 22:11:38.257228", "step": 1246, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:38.322615", "step": 1246, "epoch": 2 }, { "type": "loss", "content": 0.032148636877536774, "timestamp": "2025-09-30 22:11:38.326587", "step": 1247, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:38.380869", "step": 1247, "epoch": 2 }, { "type": "loss", "content": 0.014642714522778988, "timestamp": "2025-09-30 22:11:38.387162", "step": 1248, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:38.446848", "step": 1248, "epoch": 2 }, { "type": "loss", "content": 0.027036087587475777, "timestamp": "2025-09-30 22:11:38.450541", "step": 1249, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:38.505596", "step": 1249, "epoch": 2 }, { "type": "loss", "content": 0.02789353020489216, "timestamp": "2025-09-30 22:11:38.518420", "step": 1250, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:38.575018", "step": 1250, "epoch": 2 }, { "type": "loss", "content": 0.02393009327352047, "timestamp": "2025-09-30 22:11:38.578596", "step": 1251, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:38.632056", "step": 1251, "epoch": 2 }, { "type": "loss", "content": 0.010679802857339382, "timestamp": "2025-09-30 22:11:38.638585", "step": 1252, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:11:38.693945", "step": 1252, "epoch": 2 }, { "type": "loss", "content": 0.005615527741611004, "timestamp": "2025-09-30 22:11:38.697404", "step": 1253, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:38.753186", "step": 1253, "epoch": 2 }, { "type": "loss", "content": 0.008227822370827198, "timestamp": "2025-09-30 22:11:38.756726", "step": 1254, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:11:39.978401", "step": 1254, "epoch": 2 }, { "type": "pplx", "content": 30062323.890613366, "timestamp": "2025-09-30 22:11:39.989728", "step": 1254, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:40.042732", "step": 1254, "epoch": 2 }, { "type": "loss", "content": 0.007192269433289766, "timestamp": "2025-09-30 22:11:40.046046", "step": 1255, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:40.100429", "step": 1255, "epoch": 2 }, { "type": "loss", "content": 0.026882369071245193, "timestamp": "2025-09-30 22:11:40.106496", "step": 1256, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:40.159745", "step": 1256, "epoch": 2 }, { "type": "loss", "content": 0.01234396081417799, "timestamp": "2025-09-30 22:11:40.164224", "step": 1257, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:40.220058", "step": 1257, "epoch": 2 }, { "type": "loss", "content": 0.02563643269240856, "timestamp": "2025-09-30 22:11:40.223694", "step": 1258, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:40.279550", "step": 1258, "epoch": 2 }, { "type": "loss", "content": 0.008174607530236244, "timestamp": "2025-09-30 22:11:40.283730", "step": 1259, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:40.344881", "step": 1259, "epoch": 2 }, { "type": "loss", "content": 0.015055462718009949, "timestamp": "2025-09-30 22:11:40.351884", "step": 1260, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:40.404793", "step": 1260, "epoch": 2 }, { "type": "loss", "content": 0.044176094233989716, "timestamp": "2025-09-30 22:11:40.407606", "step": 1261, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:40.462171", "step": 1261, "epoch": 2 }, { "type": "loss", "content": 0.012365161441266537, "timestamp": "2025-09-30 22:11:40.466037", "step": 1262, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:40.526018", "step": 1262, "epoch": 2 }, { "type": "loss", "content": 0.021339308470487595, "timestamp": "2025-09-30 22:11:40.539809", "step": 1263, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:40.594211", "step": 1263, "epoch": 2 }, { "type": "loss", "content": 0.013789476826786995, "timestamp": "2025-09-30 22:11:40.611854", "step": 1264, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:40.676229", "step": 1264, "epoch": 2 }, { "type": "loss", "content": 0.010516809299588203, "timestamp": "2025-09-30 22:11:40.679039", "step": 1265, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:40.733124", "step": 1265, "epoch": 2 }, { "type": "loss", "content": 0.01804107055068016, "timestamp": "2025-09-30 22:11:40.736506", "step": 1266, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:40.798080", "step": 1266, "epoch": 2 }, { "type": "loss", "content": 0.016653243452310562, "timestamp": "2025-09-30 22:11:40.801372", "step": 1267, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:40.863841", "step": 1267, "epoch": 2 }, { "type": "loss", "content": 0.023181870579719543, "timestamp": "2025-09-30 22:11:40.870269", "step": 1268, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:40.923609", "step": 1268, "epoch": 2 }, { "type": "loss", "content": 0.016313303261995316, "timestamp": "2025-09-30 22:11:40.926436", "step": 1269, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:40.981079", "step": 1269, "epoch": 2 }, { "type": "loss", "content": 0.018928783014416695, "timestamp": "2025-09-30 22:11:40.994550", "step": 1270, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:41.049139", "step": 1270, "epoch": 2 }, { "type": "loss", "content": 0.009127925150096416, "timestamp": "2025-09-30 22:11:41.057956", "step": 1271, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:41.120907", "step": 1271, "epoch": 2 }, { "type": "loss", "content": 0.02112032100558281, "timestamp": "2025-09-30 22:11:41.128416", "step": 1272, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:41.182515", "step": 1272, "epoch": 2 }, { "type": "loss", "content": 0.009894484654068947, "timestamp": "2025-09-30 22:11:41.187843", "step": 1273, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:41.249575", "step": 1273, "epoch": 2 }, { "type": "loss", "content": 0.022361796349287033, "timestamp": "2025-09-30 22:11:41.252800", "step": 1274, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:41.313068", "step": 1274, "epoch": 2 }, { "type": "loss", "content": 0.011399311013519764, "timestamp": "2025-09-30 22:11:41.317342", "step": 1275, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:41.371745", "step": 1275, "epoch": 2 }, { "type": "loss", "content": 0.009731536731123924, "timestamp": "2025-09-30 22:11:41.378459", "step": 1276, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:41.433288", "step": 1276, "epoch": 2 }, { "type": "loss", "content": 0.015632973983883858, "timestamp": "2025-09-30 22:11:41.436646", "step": 1277, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:41.497498", "step": 1277, "epoch": 2 }, { "type": "loss", "content": 0.012282473966479301, "timestamp": "2025-09-30 22:11:41.501010", "step": 1278, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:41.554689", "step": 1278, "epoch": 2 }, { "type": "loss", "content": 0.03517066687345505, "timestamp": "2025-09-30 22:11:41.558237", "step": 1279, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:41.613087", "step": 1279, "epoch": 2 }, { "type": "loss", "content": 0.018857363611459732, "timestamp": "2025-09-30 22:11:41.629174", "step": 1280, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:41.684409", "step": 1280, "epoch": 2 }, { "type": "loss", "content": 0.009674946777522564, "timestamp": "2025-09-30 22:11:41.698238", "step": 1281, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:41.757905", "step": 1281, "epoch": 2 }, { "type": "loss", "content": 0.01863682270050049, "timestamp": "2025-09-30 22:11:41.761075", "step": 1282, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:41.818226", "step": 1282, "epoch": 2 }, { "type": "loss", "content": 0.00765694584697485, "timestamp": "2025-09-30 22:11:41.821824", "step": 1283, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:11:41.876313", "step": 1283, "epoch": 2 }, { "type": "loss", "content": 0.009559721685945988, "timestamp": "2025-09-30 22:11:41.883478", "step": 1284, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:41.936511", "step": 1284, "epoch": 2 }, { "type": "loss", "content": 0.024069128558039665, "timestamp": "2025-09-30 22:11:41.939872", "step": 1285, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:41.993918", "step": 1285, "epoch": 2 }, { "type": "loss", "content": 0.006193791516125202, "timestamp": "2025-09-30 22:11:41.998037", "step": 1286, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:42.053332", "step": 1286, "epoch": 2 }, { "type": "loss", "content": 0.015762005001306534, "timestamp": "2025-09-30 22:11:42.063465", "step": 1287, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:42.134164", "step": 1287, "epoch": 2 }, { "type": "loss", "content": 0.005604383070021868, "timestamp": "2025-09-30 22:11:42.143673", "step": 1288, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:42.197288", "step": 1288, "epoch": 2 }, { "type": "loss", "content": 0.03566458821296692, "timestamp": "2025-09-30 22:11:42.201038", "step": 1289, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:42.256727", "step": 1289, "epoch": 2 }, { "type": "loss", "content": 0.012333127669990063, "timestamp": "2025-09-30 22:11:42.260609", "step": 1290, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:42.316139", "step": 1290, "epoch": 2 }, { "type": "loss", "content": 0.020386451855301857, "timestamp": "2025-09-30 22:11:42.321025", "step": 1291, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:42.375757", "step": 1291, "epoch": 2 }, { "type": "loss", "content": 0.013080189004540443, "timestamp": "2025-09-30 22:11:42.382534", "step": 1292, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:42.444980", "step": 1292, "epoch": 2 }, { "type": "loss", "content": 0.004526190459728241, "timestamp": "2025-09-30 22:11:42.448558", "step": 1293, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:42.503079", "step": 1293, "epoch": 2 }, { "type": "loss", "content": 0.0055126287043094635, "timestamp": "2025-09-30 22:11:42.506004", "step": 1294, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:42.572615", "step": 1294, "epoch": 2 }, { "type": "loss", "content": 0.005155415739864111, "timestamp": "2025-09-30 22:11:42.575746", "step": 1295, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:42.629780", "step": 1295, "epoch": 2 }, { "type": "loss", "content": 0.02876908890902996, "timestamp": "2025-09-30 22:11:42.637312", "step": 1296, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:42.692052", "step": 1296, "epoch": 2 }, { "type": "loss", "content": 0.019892612472176552, "timestamp": "2025-09-30 22:11:42.695367", "step": 1297, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:42.750139", "step": 1297, "epoch": 2 }, { "type": "loss", "content": 0.024674100801348686, "timestamp": "2025-09-30 22:11:42.753989", "step": 1298, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:42.809884", "step": 1298, "epoch": 2 }, { "type": "loss", "content": 0.024183692410588264, "timestamp": "2025-09-30 22:11:42.822187", "step": 1299, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:42.880441", "step": 1299, "epoch": 2 }, { "type": "loss", "content": 0.0076071759685873985, "timestamp": "2025-09-30 22:11:42.887514", "step": 1300, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:42.945521", "step": 1300, "epoch": 2 }, { "type": "loss", "content": 0.014696098864078522, "timestamp": "2025-09-30 22:11:42.956393", "step": 1301, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:43.020195", "step": 1301, "epoch": 2 }, { "type": "loss", "content": 0.009948963299393654, "timestamp": "2025-09-30 22:11:43.024109", "step": 1302, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:43.079274", "step": 1302, "epoch": 2 }, { "type": "loss", "content": 0.018407296389341354, "timestamp": "2025-09-30 22:11:43.084424", "step": 1303, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:43.141234", "step": 1303, "epoch": 2 }, { "type": "loss", "content": 0.006092921365052462, "timestamp": "2025-09-30 22:11:43.148848", "step": 1304, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:43.213109", "step": 1304, "epoch": 2 }, { "type": "loss", "content": 0.011458395048975945, "timestamp": "2025-09-30 22:11:43.216786", "step": 1305, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:43.270249", "step": 1305, "epoch": 2 }, { "type": "loss", "content": 0.00822626892477274, "timestamp": "2025-09-30 22:11:43.273048", "step": 1306, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:43.328334", "step": 1306, "epoch": 2 }, { "type": "loss", "content": 0.016102908179163933, "timestamp": "2025-09-30 22:11:43.332627", "step": 1307, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:43.388206", "step": 1307, "epoch": 2 }, { "type": "loss", "content": 0.013590490445494652, "timestamp": "2025-09-30 22:11:43.395291", "step": 1308, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:43.449410", "step": 1308, "epoch": 2 }, { "type": "loss", "content": 0.005517884157598019, "timestamp": "2025-09-30 22:11:43.456366", "step": 1309, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:43.510445", "step": 1309, "epoch": 2 }, { "type": "loss", "content": 0.014826023019850254, "timestamp": "2025-09-30 22:11:43.513652", "step": 1310, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:11:43.577385", "step": 1310, "epoch": 2 }, { "type": "loss", "content": 0.019576190039515495, "timestamp": "2025-09-30 22:11:43.580375", "step": 1311, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:11:44.824236", "step": 1311, "epoch": 2 }, { "type": "pplx", "content": 32609117.843413066, "timestamp": "2025-09-30 22:11:44.828325", "step": 1311, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:44.880343", "step": 1311, "epoch": 2 }, { "type": "loss", "content": 0.004271751269698143, "timestamp": "2025-09-30 22:11:44.887291", "step": 1312, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:44.943672", "step": 1312, "epoch": 2 }, { "type": "loss", "content": 0.00479540228843689, "timestamp": "2025-09-30 22:11:44.947951", "step": 1313, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:45.004542", "step": 1313, "epoch": 2 }, { "type": "loss", "content": 0.0011680542957037687, "timestamp": "2025-09-30 22:11:45.014421", "step": 1314, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:45.068628", "step": 1314, "epoch": 2 }, { "type": "loss", "content": 0.014515669085085392, "timestamp": "2025-09-30 22:11:45.071802", "step": 1315, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:45.125654", "step": 1315, "epoch": 2 }, { "type": "loss", "content": 0.008202152326703072, "timestamp": "2025-09-30 22:11:45.133686", "step": 1316, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:45.188161", "step": 1316, "epoch": 2 }, { "type": "loss", "content": 0.012495539151132107, "timestamp": "2025-09-30 22:11:45.192099", "step": 1317, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:45.246718", "step": 1317, "epoch": 2 }, { "type": "loss", "content": 0.0031438537407666445, "timestamp": "2025-09-30 22:11:45.250209", "step": 1318, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:45.309314", "step": 1318, "epoch": 2 }, { "type": "loss", "content": 0.018467547371983528, "timestamp": "2025-09-30 22:11:45.312481", "step": 1319, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:45.378622", "step": 1319, "epoch": 2 }, { "type": "loss", "content": 0.005382470320910215, "timestamp": "2025-09-30 22:11:45.389200", "step": 1320, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:45.449995", "step": 1320, "epoch": 2 }, { "type": "loss", "content": 0.04153914004564285, "timestamp": "2025-09-30 22:11:45.453345", "step": 1321, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:45.506615", "step": 1321, "epoch": 2 }, { "type": "loss", "content": 0.023785650730133057, "timestamp": "2025-09-30 22:11:45.509560", "step": 1322, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:45.565186", "step": 1322, "epoch": 2 }, { "type": "loss", "content": 0.010426363907754421, "timestamp": "2025-09-30 22:11:45.569391", "step": 1323, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:45.623237", "step": 1323, "epoch": 2 }, { "type": "loss", "content": 0.007811260875314474, "timestamp": "2025-09-30 22:11:45.631937", "step": 1324, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:45.684753", "step": 1324, "epoch": 2 }, { "type": "loss", "content": 0.00887636374682188, "timestamp": "2025-09-30 22:11:45.689437", "step": 1325, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:45.743278", "step": 1325, "epoch": 2 }, { "type": "loss", "content": 0.007312100380659103, "timestamp": "2025-09-30 22:11:45.746821", "step": 1326, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:45.801232", "step": 1326, "epoch": 2 }, { "type": "loss", "content": 0.003930172882974148, "timestamp": "2025-09-30 22:11:45.804113", "step": 1327, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:45.866226", "step": 1327, "epoch": 2 }, { "type": "loss", "content": 0.011160810478031635, "timestamp": "2025-09-30 22:11:45.881512", "step": 1328, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:45.935904", "step": 1328, "epoch": 2 }, { "type": "loss", "content": 0.017904015257954597, "timestamp": "2025-09-30 22:11:45.942772", "step": 1329, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:45.997789", "step": 1329, "epoch": 2 }, { "type": "loss", "content": 0.00545323733240366, "timestamp": "2025-09-30 22:11:46.006031", "step": 1330, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:46.062131", "step": 1330, "epoch": 2 }, { "type": "loss", "content": 0.04627959802746773, "timestamp": "2025-09-30 22:11:46.066083", "step": 1331, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:46.120957", "step": 1331, "epoch": 2 }, { "type": "loss", "content": 0.02013224922120571, "timestamp": "2025-09-30 22:11:46.127254", "step": 1332, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:46.180087", "step": 1332, "epoch": 2 }, { "type": "loss", "content": 0.01763824000954628, "timestamp": "2025-09-30 22:11:46.183576", "step": 1333, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:46.238119", "step": 1333, "epoch": 2 }, { "type": "loss", "content": 0.01840882934629917, "timestamp": "2025-09-30 22:11:46.241142", "step": 1334, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:46.302685", "step": 1334, "epoch": 2 }, { "type": "loss", "content": 0.029163610190153122, "timestamp": "2025-09-30 22:11:46.306239", "step": 1335, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:46.365479", "step": 1335, "epoch": 2 }, { "type": "loss", "content": 0.005280309822410345, "timestamp": "2025-09-30 22:11:46.372879", "step": 1336, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:46.427591", "step": 1336, "epoch": 2 }, { "type": "loss", "content": 0.016217708587646484, "timestamp": "2025-09-30 22:11:46.430872", "step": 1337, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:46.485684", "step": 1337, "epoch": 2 }, { "type": "loss", "content": 0.016818564385175705, "timestamp": "2025-09-30 22:11:46.491284", "step": 1338, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:46.566008", "step": 1338, "epoch": 2 }, { "type": "loss", "content": 0.005953342653810978, "timestamp": "2025-09-30 22:11:46.569427", "step": 1339, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:46.626744", "step": 1339, "epoch": 2 }, { "type": "loss", "content": 0.019312188029289246, "timestamp": "2025-09-30 22:11:46.633310", "step": 1340, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:46.694266", "step": 1340, "epoch": 2 }, { "type": "loss", "content": 0.014265080913901329, "timestamp": "2025-09-30 22:11:46.697938", "step": 1341, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:46.759167", "step": 1341, "epoch": 2 }, { "type": "loss", "content": 0.014429310336709023, "timestamp": "2025-09-30 22:11:46.762088", "step": 1342, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:46.833476", "step": 1342, "epoch": 2 }, { "type": "loss", "content": 0.01076581608504057, "timestamp": "2025-09-30 22:11:46.843812", "step": 1343, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:46.909121", "step": 1343, "epoch": 2 }, { "type": "loss", "content": 0.019898299127817154, "timestamp": "2025-09-30 22:11:46.917240", "step": 1344, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:46.977775", "step": 1344, "epoch": 2 }, { "type": "loss", "content": 0.0010426754597574472, "timestamp": "2025-09-30 22:11:46.981247", "step": 1345, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:47.039218", "step": 1345, "epoch": 2 }, { "type": "loss", "content": 0.022380618378520012, "timestamp": "2025-09-30 22:11:47.045477", "step": 1346, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:47.100496", "step": 1346, "epoch": 2 }, { "type": "loss", "content": 0.012452373281121254, "timestamp": "2025-09-30 22:11:47.106816", "step": 1347, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:47.161611", "step": 1347, "epoch": 2 }, { "type": "loss", "content": 0.002341157989576459, "timestamp": "2025-09-30 22:11:47.169873", "step": 1348, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:47.224614", "step": 1348, "epoch": 2 }, { "type": "loss", "content": 0.002771148458123207, "timestamp": "2025-09-30 22:11:47.227269", "step": 1349, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:47.282287", "step": 1349, "epoch": 2 }, { "type": "loss", "content": 0.010354334488511086, "timestamp": "2025-09-30 22:11:47.285453", "step": 1350, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:11:47.347031", "step": 1350, "epoch": 2 }, { "type": "loss", "content": 0.009396574459969997, "timestamp": "2025-09-30 22:11:47.349952", "step": 1351, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:47.406731", "step": 1351, "epoch": 2 }, { "type": "loss", "content": 0.003213896183297038, "timestamp": "2025-09-30 22:11:47.413957", "step": 1352, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:47.469778", "step": 1352, "epoch": 2 }, { "type": "loss", "content": 0.004314431454986334, "timestamp": "2025-09-30 22:11:47.473353", "step": 1353, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:47.529789", "step": 1353, "epoch": 2 }, { "type": "loss", "content": 0.013174169696867466, "timestamp": "2025-09-30 22:11:47.533184", "step": 1354, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:47.590717", "step": 1354, "epoch": 2 }, { "type": "loss", "content": 0.015209296718239784, "timestamp": "2025-09-30 22:11:47.594409", "step": 1355, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:47.650799", "step": 1355, "epoch": 2 }, { "type": "loss", "content": 0.038054872304201126, "timestamp": "2025-09-30 22:11:47.658833", "step": 1356, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:47.712960", "step": 1356, "epoch": 2 }, { "type": "loss", "content": 0.009656942449510098, "timestamp": "2025-09-30 22:11:47.716647", "step": 1357, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:47.777202", "step": 1357, "epoch": 2 }, { "type": "loss", "content": 0.01215650700032711, "timestamp": "2025-09-30 22:11:47.786134", "step": 1358, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:47.846371", "step": 1358, "epoch": 2 }, { "type": "loss", "content": 0.015105058439075947, "timestamp": "2025-09-30 22:11:47.854309", "step": 1359, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:47.914321", "step": 1359, "epoch": 2 }, { "type": "loss", "content": 0.002900704275816679, "timestamp": "2025-09-30 22:11:47.926334", "step": 1360, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:47.980560", "step": 1360, "epoch": 2 }, { "type": "loss", "content": 0.020952101796865463, "timestamp": "2025-09-30 22:11:47.987594", "step": 1361, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:48.043298", "step": 1361, "epoch": 2 }, { "type": "loss", "content": 0.015315545722842216, "timestamp": "2025-09-30 22:11:48.050262", "step": 1362, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:48.107218", "step": 1362, "epoch": 2 }, { "type": "loss", "content": 0.0064353845082223415, "timestamp": "2025-09-30 22:11:48.110270", "step": 1363, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:48.169342", "step": 1363, "epoch": 2 }, { "type": "loss", "content": 0.006530344020575285, "timestamp": "2025-09-30 22:11:48.179583", "step": 1364, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:48.241263", "step": 1364, "epoch": 2 }, { "type": "loss", "content": 0.017590684816241264, "timestamp": "2025-09-30 22:11:48.249981", "step": 1365, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:48.310235", "step": 1365, "epoch": 2 }, { "type": "loss", "content": 0.007951964624226093, "timestamp": "2025-09-30 22:11:48.318949", "step": 1366, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:48.374804", "step": 1366, "epoch": 2 }, { "type": "loss", "content": 0.01660231314599514, "timestamp": "2025-09-30 22:11:48.377100", "step": 1367, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:48.438207", "step": 1367, "epoch": 2 }, { "type": "loss", "content": 0.007098283153027296, "timestamp": "2025-09-30 22:11:48.445739", "step": 1368, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:11:49.664039", "step": 1368, "epoch": 2 }, { "type": "pplx", "content": 33795719.590478756, "timestamp": "2025-09-30 22:11:49.666611", "step": 1368, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:49.719047", "step": 1368, "epoch": 2 }, { "type": "loss", "content": 0.0031422681640833616, "timestamp": "2025-09-30 22:11:49.721231", "step": 1369, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:49.775273", "step": 1369, "epoch": 2 }, { "type": "loss", "content": 0.007872308604419231, "timestamp": "2025-09-30 22:11:49.778345", "step": 1370, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:11:49.832827", "step": 1370, "epoch": 2 }, { "type": "loss", "content": 0.011166645213961601, "timestamp": "2025-09-30 22:11:49.838286", "step": 1371, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:49.892684", "step": 1371, "epoch": 2 }, { "type": "loss", "content": 0.008558930829167366, "timestamp": "2025-09-30 22:11:49.899692", "step": 1372, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:49.956402", "step": 1372, "epoch": 2 }, { "type": "loss", "content": 0.01813081093132496, "timestamp": "2025-09-30 22:11:49.960747", "step": 1373, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:50.015076", "step": 1373, "epoch": 2 }, { "type": "loss", "content": 0.014572428539395332, "timestamp": "2025-09-30 22:11:50.017794", "step": 1374, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:50.071943", "step": 1374, "epoch": 2 }, { "type": "loss", "content": 0.025154201313853264, "timestamp": "2025-09-30 22:11:50.074765", "step": 1375, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:50.134387", "step": 1375, "epoch": 2 }, { "type": "loss", "content": 0.006909341551363468, "timestamp": "2025-09-30 22:11:50.143622", "step": 1376, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:50.199261", "step": 1376, "epoch": 2 }, { "type": "loss", "content": 0.024538232013583183, "timestamp": "2025-09-30 22:11:50.201749", "step": 1377, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:50.258024", "step": 1377, "epoch": 2 }, { "type": "loss", "content": 0.016891105100512505, "timestamp": "2025-09-30 22:11:50.261650", "step": 1378, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:50.316502", "step": 1378, "epoch": 2 }, { "type": "loss", "content": 0.02443709410727024, "timestamp": "2025-09-30 22:11:50.319617", "step": 1379, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:50.373500", "step": 1379, "epoch": 2 }, { "type": "loss", "content": 0.00553086306899786, "timestamp": "2025-09-30 22:11:50.381388", "step": 1380, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:50.435108", "step": 1380, "epoch": 2 }, { "type": "loss", "content": 0.02132841758430004, "timestamp": "2025-09-30 22:11:50.442804", "step": 1381, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:50.500292", "step": 1381, "epoch": 2 }, { "type": "loss", "content": 0.00981497298926115, "timestamp": "2025-09-30 22:11:50.503681", "step": 1382, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:50.557827", "step": 1382, "epoch": 2 }, { "type": "loss", "content": 0.0064003728330135345, "timestamp": "2025-09-30 22:11:50.561098", "step": 1383, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:50.615611", "step": 1383, "epoch": 2 }, { "type": "loss", "content": 0.0022823542822152376, "timestamp": "2025-09-30 22:11:50.622947", "step": 1384, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:50.678363", "step": 1384, "epoch": 2 }, { "type": "loss", "content": 0.02859966643154621, "timestamp": "2025-09-30 22:11:50.681709", "step": 1385, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:50.735485", "step": 1385, "epoch": 2 }, { "type": "loss", "content": 0.017722919583320618, "timestamp": "2025-09-30 22:11:50.738377", "step": 1386, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:50.792609", "step": 1386, "epoch": 2 }, { "type": "loss", "content": 0.03379681333899498, "timestamp": "2025-09-30 22:11:50.796363", "step": 1387, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:50.849961", "step": 1387, "epoch": 2 }, { "type": "loss", "content": 0.003414042294025421, "timestamp": "2025-09-30 22:11:50.859211", "step": 1388, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:50.914429", "step": 1388, "epoch": 2 }, { "type": "loss", "content": 0.014491712674498558, "timestamp": "2025-09-30 22:11:50.921126", "step": 1389, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:50.977992", "step": 1389, "epoch": 2 }, { "type": "loss", "content": 0.009958495385944843, "timestamp": "2025-09-30 22:11:50.980673", "step": 1390, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:51.041085", "step": 1390, "epoch": 2 }, { "type": "loss", "content": 0.006238664500415325, "timestamp": "2025-09-30 22:11:51.047289", "step": 1391, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:51.105263", "step": 1391, "epoch": 2 }, { "type": "loss", "content": 0.014195479452610016, "timestamp": "2025-09-30 22:11:51.118187", "step": 1392, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:51.175096", "step": 1392, "epoch": 2 }, { "type": "loss", "content": 0.00753815146163106, "timestamp": "2025-09-30 22:11:51.178464", "step": 1393, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:51.234581", "step": 1393, "epoch": 2 }, { "type": "loss", "content": 0.013055319897830486, "timestamp": "2025-09-30 22:11:51.237281", "step": 1394, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:51.292813", "step": 1394, "epoch": 2 }, { "type": "loss", "content": 0.004737398587167263, "timestamp": "2025-09-30 22:11:51.294813", "step": 1395, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:51.348986", "step": 1395, "epoch": 2 }, { "type": "loss", "content": 0.002370731672272086, "timestamp": "2025-09-30 22:11:51.357468", "step": 1396, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:51.411158", "step": 1396, "epoch": 2 }, { "type": "loss", "content": 0.002246677177026868, "timestamp": "2025-09-30 22:11:51.417173", "step": 1397, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:51.475633", "step": 1397, "epoch": 2 }, { "type": "loss", "content": 0.05915753170847893, "timestamp": "2025-09-30 22:11:51.481055", "step": 1398, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:51.536694", "step": 1398, "epoch": 2 }, { "type": "loss", "content": 0.007206898648291826, "timestamp": "2025-09-30 22:11:51.540164", "step": 1399, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:51.596906", "step": 1399, "epoch": 2 }, { "type": "loss", "content": 0.003330029547214508, "timestamp": "2025-09-30 22:11:51.604499", "step": 1400, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:51.661337", "step": 1400, "epoch": 2 }, { "type": "loss", "content": 0.01776135340332985, "timestamp": "2025-09-30 22:11:51.664234", "step": 1401, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:51.719714", "step": 1401, "epoch": 2 }, { "type": "loss", "content": 0.007816934958100319, "timestamp": "2025-09-30 22:11:51.722380", "step": 1402, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:51.776606", "step": 1402, "epoch": 2 }, { "type": "loss", "content": 0.006343926768749952, "timestamp": "2025-09-30 22:11:51.779877", "step": 1403, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:51.834652", "step": 1403, "epoch": 2 }, { "type": "loss", "content": 0.01646248623728752, "timestamp": "2025-09-30 22:11:51.842243", "step": 1404, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:51.897403", "step": 1404, "epoch": 2 }, { "type": "loss", "content": 0.011257813312113285, "timestamp": "2025-09-30 22:11:51.900431", "step": 1405, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:51.953481", "step": 1405, "epoch": 2 }, { "type": "loss", "content": 0.010497097857296467, "timestamp": "2025-09-30 22:11:51.959746", "step": 1406, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:52.015068", "step": 1406, "epoch": 2 }, { "type": "loss", "content": 0.002651577116921544, "timestamp": "2025-09-30 22:11:52.019159", "step": 1407, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:52.072606", "step": 1407, "epoch": 2 }, { "type": "loss", "content": 0.03266320377588272, "timestamp": "2025-09-30 22:11:52.085621", "step": 1408, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:52.141617", "step": 1408, "epoch": 2 }, { "type": "loss", "content": 0.01602781191468239, "timestamp": "2025-09-30 22:11:52.146954", "step": 1409, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:52.203737", "step": 1409, "epoch": 2 }, { "type": "loss", "content": 0.002372512361034751, "timestamp": "2025-09-30 22:11:52.207368", "step": 1410, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:52.262628", "step": 1410, "epoch": 2 }, { "type": "loss", "content": 0.014318128116428852, "timestamp": "2025-09-30 22:11:52.266816", "step": 1411, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:52.323101", "step": 1411, "epoch": 2 }, { "type": "loss", "content": 0.00335204997099936, "timestamp": "2025-09-30 22:11:52.333507", "step": 1412, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:52.399815", "step": 1412, "epoch": 2 }, { "type": "loss", "content": 0.006724204868078232, "timestamp": "2025-09-30 22:11:52.405354", "step": 1413, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:52.460557", "step": 1413, "epoch": 2 }, { "type": "loss", "content": 0.008304744958877563, "timestamp": "2025-09-30 22:11:52.470446", "step": 1414, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:52.532792", "step": 1414, "epoch": 2 }, { "type": "loss", "content": 0.013678102754056454, "timestamp": "2025-09-30 22:11:52.535675", "step": 1415, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:52.590913", "step": 1415, "epoch": 2 }, { "type": "loss", "content": 0.00877212081104517, "timestamp": "2025-09-30 22:11:52.596849", "step": 1416, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:52.649257", "step": 1416, "epoch": 2 }, { "type": "loss", "content": 0.0023188970517367125, "timestamp": "2025-09-30 22:11:52.651657", "step": 1417, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:52.706530", "step": 1417, "epoch": 2 }, { "type": "loss", "content": 0.002683415310457349, "timestamp": "2025-09-30 22:11:52.709269", "step": 1418, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:52.772989", "step": 1418, "epoch": 2 }, { "type": "loss", "content": 0.006297766696661711, "timestamp": "2025-09-30 22:11:52.775729", "step": 1419, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:52.830863", "step": 1419, "epoch": 2 }, { "type": "loss", "content": 0.018091067671775818, "timestamp": "2025-09-30 22:11:52.838563", "step": 1420, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:52.892921", "step": 1420, "epoch": 2 }, { "type": "loss", "content": 0.015514843165874481, "timestamp": "2025-09-30 22:11:52.895667", "step": 1421, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:52.949542", "step": 1421, "epoch": 2 }, { "type": "loss", "content": 0.0016685453010722995, "timestamp": "2025-09-30 22:11:52.959052", "step": 1422, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:53.015770", "step": 1422, "epoch": 2 }, { "type": "loss", "content": 0.04028955474495888, "timestamp": "2025-09-30 22:11:53.018535", "step": 1423, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:53.074396", "step": 1423, "epoch": 2 }, { "type": "loss", "content": 0.018086543306708336, "timestamp": "2025-09-30 22:11:53.081604", "step": 1424, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:53.137519", "step": 1424, "epoch": 2 }, { "type": "loss", "content": 0.03144695982336998, "timestamp": "2025-09-30 22:11:53.145979", "step": 1425, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:11:54.385788", "step": 1425, "epoch": 2 }, { "type": "pplx", "content": 32876480.431024157, "timestamp": "2025-09-30 22:11:54.388368", "step": 1425, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:54.441452", "step": 1425, "epoch": 2 }, { "type": "loss", "content": 0.0008492738706991076, "timestamp": "2025-09-30 22:11:54.444996", "step": 1426, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:54.497804", "step": 1426, "epoch": 2 }, { "type": "loss", "content": 0.006496089976280928, "timestamp": "2025-09-30 22:11:54.500486", "step": 1427, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:54.554857", "step": 1427, "epoch": 2 }, { "type": "loss", "content": 0.046025075018405914, "timestamp": "2025-09-30 22:11:54.561183", "step": 1428, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:54.620775", "step": 1428, "epoch": 2 }, { "type": "loss", "content": 0.004891328979283571, "timestamp": "2025-09-30 22:11:54.623529", "step": 1429, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:54.678675", "step": 1429, "epoch": 2 }, { "type": "loss", "content": 0.006063086446374655, "timestamp": "2025-09-30 22:11:54.689551", "step": 1430, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:54.744569", "step": 1430, "epoch": 2 }, { "type": "loss", "content": 0.015636909753084183, "timestamp": "2025-09-30 22:11:54.746833", "step": 1431, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:54.801259", "step": 1431, "epoch": 2 }, { "type": "loss", "content": 0.009860769845545292, "timestamp": "2025-09-30 22:11:54.812696", "step": 1432, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:54.867412", "step": 1432, "epoch": 2 }, { "type": "loss", "content": 0.021920878440141678, "timestamp": "2025-09-30 22:11:54.873928", "step": 1433, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:54.933680", "step": 1433, "epoch": 2 }, { "type": "loss", "content": 0.03438463434576988, "timestamp": "2025-09-30 22:11:54.935946", "step": 1434, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:54.990181", "step": 1434, "epoch": 2 }, { "type": "loss", "content": 0.010635143145918846, "timestamp": "2025-09-30 22:11:54.992804", "step": 1435, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:55.055330", "step": 1435, "epoch": 2 }, { "type": "loss", "content": 0.00384154194034636, "timestamp": "2025-09-30 22:11:55.062260", "step": 1436, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:55.116261", "step": 1436, "epoch": 2 }, { "type": "loss", "content": 0.01956865005195141, "timestamp": "2025-09-30 22:11:55.126079", "step": 1437, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:55.184259", "step": 1437, "epoch": 2 }, { "type": "loss", "content": 0.011690896935760975, "timestamp": "2025-09-30 22:11:55.187051", "step": 1438, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:55.240826", "step": 1438, "epoch": 2 }, { "type": "loss", "content": 0.0043997629545629025, "timestamp": "2025-09-30 22:11:55.242792", "step": 1439, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:55.295661", "step": 1439, "epoch": 2 }, { "type": "loss", "content": 0.0012786614242941141, "timestamp": "2025-09-30 22:11:55.301411", "step": 1440, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:55.353838", "step": 1440, "epoch": 2 }, { "type": "loss", "content": 0.001398958032950759, "timestamp": "2025-09-30 22:11:55.356227", "step": 1441, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:55.412017", "step": 1441, "epoch": 2 }, { "type": "loss", "content": 0.0023240819573402405, "timestamp": "2025-09-30 22:11:55.417275", "step": 1442, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:55.470340", "step": 1442, "epoch": 2 }, { "type": "loss", "content": 0.014607422053813934, "timestamp": "2025-09-30 22:11:55.475088", "step": 1443, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:55.528302", "step": 1443, "epoch": 2 }, { "type": "loss", "content": 0.001739148749038577, "timestamp": "2025-09-30 22:11:55.534596", "step": 1444, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:55.591866", "step": 1444, "epoch": 2 }, { "type": "loss", "content": 0.011579596437513828, "timestamp": "2025-09-30 22:11:55.595434", "step": 1445, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:55.649148", "step": 1445, "epoch": 2 }, { "type": "loss", "content": 0.01198033057153225, "timestamp": "2025-09-30 22:11:55.651642", "step": 1446, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:55.705180", "step": 1446, "epoch": 2 }, { "type": "loss", "content": 0.010683462955057621, "timestamp": "2025-09-30 22:11:55.707304", "step": 1447, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:55.762754", "step": 1447, "epoch": 2 }, { "type": "loss", "content": 0.010873474180698395, "timestamp": "2025-09-30 22:11:55.768493", "step": 1448, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:55.821216", "step": 1448, "epoch": 2 }, { "type": "loss", "content": 0.016803989186882973, "timestamp": "2025-09-30 22:11:55.824294", "step": 1449, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:55.879996", "step": 1449, "epoch": 2 }, { "type": "loss", "content": 0.024432357400655746, "timestamp": "2025-09-30 22:11:55.883382", "step": 1450, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:55.937573", "step": 1450, "epoch": 2 }, { "type": "loss", "content": 0.01734604313969612, "timestamp": "2025-09-30 22:11:55.941133", "step": 1451, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:55.994416", "step": 1451, "epoch": 2 }, { "type": "loss", "content": 0.002315593184903264, "timestamp": "2025-09-30 22:11:56.001288", "step": 1452, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:56.055517", "step": 1452, "epoch": 2 }, { "type": "loss", "content": 0.0013011764967814088, "timestamp": "2025-09-30 22:11:56.057536", "step": 1453, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:56.111279", "step": 1453, "epoch": 2 }, { "type": "loss", "content": 0.014267000369727612, "timestamp": "2025-09-30 22:11:56.115214", "step": 1454, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:56.177103", "step": 1454, "epoch": 2 }, { "type": "loss", "content": 0.01388590969145298, "timestamp": "2025-09-30 22:11:56.179331", "step": 1455, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:56.231796", "step": 1455, "epoch": 2 }, { "type": "loss", "content": 0.03596239164471626, "timestamp": "2025-09-30 22:11:56.238002", "step": 1456, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:56.291186", "step": 1456, "epoch": 2 }, { "type": "loss", "content": 0.008110507391393185, "timestamp": "2025-09-30 22:11:56.293601", "step": 1457, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:56.346799", "step": 1457, "epoch": 2 }, { "type": "loss", "content": 0.005775760859251022, "timestamp": "2025-09-30 22:11:56.348855", "step": 1458, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:56.401731", "step": 1458, "epoch": 2 }, { "type": "loss", "content": 0.02051071636378765, "timestamp": "2025-09-30 22:11:56.403960", "step": 1459, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:56.457070", "step": 1459, "epoch": 2 }, { "type": "loss", "content": 0.003823460778221488, "timestamp": "2025-09-30 22:11:56.463416", "step": 1460, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:56.516251", "step": 1460, "epoch": 2 }, { "type": "loss", "content": 0.05588651821017265, "timestamp": "2025-09-30 22:11:56.519240", "step": 1461, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:56.572359", "step": 1461, "epoch": 2 }, { "type": "loss", "content": 0.011291184462606907, "timestamp": "2025-09-30 22:11:56.574855", "step": 1462, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:56.638559", "step": 1462, "epoch": 2 }, { "type": "loss", "content": 0.02610655501484871, "timestamp": "2025-09-30 22:11:56.640892", "step": 1463, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:56.694525", "step": 1463, "epoch": 2 }, { "type": "loss", "content": 0.025093723088502884, "timestamp": "2025-09-30 22:11:56.702308", "step": 1464, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:56.754189", "step": 1464, "epoch": 2 }, { "type": "loss", "content": 0.018963143229484558, "timestamp": "2025-09-30 22:11:56.762237", "step": 1465, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:56.815551", "step": 1465, "epoch": 2 }, { "type": "loss", "content": 0.02996927499771118, "timestamp": "2025-09-30 22:11:56.818951", "step": 1466, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:56.871563", "step": 1466, "epoch": 2 }, { "type": "loss", "content": 0.031889282166957855, "timestamp": "2025-09-30 22:11:56.874567", "step": 1467, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:56.929697", "step": 1467, "epoch": 2 }, { "type": "loss", "content": 0.013061118312180042, "timestamp": "2025-09-30 22:11:56.938671", "step": 1468, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:56.992670", "step": 1468, "epoch": 2 }, { "type": "loss", "content": 0.021829981356859207, "timestamp": "2025-09-30 22:11:56.995200", "step": 1469, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:57.048959", "step": 1469, "epoch": 2 }, { "type": "loss", "content": 0.048463720828294754, "timestamp": "2025-09-30 22:11:57.051096", "step": 1470, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:57.105679", "step": 1470, "epoch": 2 }, { "type": "loss", "content": 0.0270835030823946, "timestamp": "2025-09-30 22:11:57.107848", "step": 1471, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:57.160647", "step": 1471, "epoch": 2 }, { "type": "loss", "content": 0.007283608429133892, "timestamp": "2025-09-30 22:11:57.166422", "step": 1472, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:57.221551", "step": 1472, "epoch": 2 }, { "type": "loss", "content": 0.014574305154383183, "timestamp": "2025-09-30 22:11:57.223680", "step": 1473, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:57.280308", "step": 1473, "epoch": 2 }, { "type": "loss", "content": 0.014809882268309593, "timestamp": "2025-09-30 22:11:57.282233", "step": 1474, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:57.335185", "step": 1474, "epoch": 2 }, { "type": "loss", "content": 0.010421657003462315, "timestamp": "2025-09-30 22:11:57.337900", "step": 1475, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:57.391394", "step": 1475, "epoch": 2 }, { "type": "loss", "content": 0.025253277271986008, "timestamp": "2025-09-30 22:11:57.399377", "step": 1476, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:57.452018", "step": 1476, "epoch": 2 }, { "type": "loss", "content": 0.005706873722374439, "timestamp": "2025-09-30 22:11:57.454324", "step": 1477, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:57.507110", "step": 1477, "epoch": 2 }, { "type": "loss", "content": 0.007750978227704763, "timestamp": "2025-09-30 22:11:57.510333", "step": 1478, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:57.563595", "step": 1478, "epoch": 2 }, { "type": "loss", "content": 0.0170641727745533, "timestamp": "2025-09-30 22:11:57.565865", "step": 1479, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:57.618452", "step": 1479, "epoch": 2 }, { "type": "loss", "content": 0.010642343200743198, "timestamp": "2025-09-30 22:11:57.624447", "step": 1480, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:57.677415", "step": 1480, "epoch": 2 }, { "type": "loss", "content": 0.022640392184257507, "timestamp": "2025-09-30 22:11:57.680487", "step": 1481, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:57.733786", "step": 1481, "epoch": 2 }, { "type": "loss", "content": 0.012234336696565151, "timestamp": "2025-09-30 22:11:57.736389", "step": 1482, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:11:58.929466", "step": 1482, "epoch": 2 }, { "type": "pplx", "content": 31872365.783038512, "timestamp": "2025-09-30 22:11:58.932319", "step": 1482, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:58.984024", "step": 1482, "epoch": 2 }, { "type": "loss", "content": 0.016802014783024788, "timestamp": "2025-09-30 22:11:58.986699", "step": 1483, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:59.041050", "step": 1483, "epoch": 2 }, { "type": "loss", "content": 0.010017321445047855, "timestamp": "2025-09-30 22:11:59.047373", "step": 1484, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:11:59.103047", "step": 1484, "epoch": 2 }, { "type": "loss", "content": 0.01991168037056923, "timestamp": "2025-09-30 22:11:59.106592", "step": 1485, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:59.161064", "step": 1485, "epoch": 2 }, { "type": "loss", "content": 0.02465466596186161, "timestamp": "2025-09-30 22:11:59.163538", "step": 1486, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:59.218736", "step": 1486, "epoch": 2 }, { "type": "loss", "content": 0.011366413906216621, "timestamp": "2025-09-30 22:11:59.221346", "step": 1487, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:59.275277", "step": 1487, "epoch": 2 }, { "type": "loss", "content": 0.009791248477995396, "timestamp": "2025-09-30 22:11:59.282523", "step": 1488, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:59.335043", "step": 1488, "epoch": 2 }, { "type": "loss", "content": 0.012804691679775715, "timestamp": "2025-09-30 22:11:59.337565", "step": 1489, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:59.391128", "step": 1489, "epoch": 2 }, { "type": "loss", "content": 0.008853507228195667, "timestamp": "2025-09-30 22:11:59.394664", "step": 1490, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:59.448734", "step": 1490, "epoch": 2 }, { "type": "loss", "content": 0.007088639307767153, "timestamp": "2025-09-30 22:11:59.451173", "step": 1491, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:59.504600", "step": 1491, "epoch": 2 }, { "type": "loss", "content": 0.017957476899027824, "timestamp": "2025-09-30 22:11:59.511239", "step": 1492, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:59.564635", "step": 1492, "epoch": 2 }, { "type": "loss", "content": 0.011476176790893078, "timestamp": "2025-09-30 22:11:59.567245", "step": 1493, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:11:59.622806", "step": 1493, "epoch": 2 }, { "type": "loss", "content": 0.014955559745430946, "timestamp": "2025-09-30 22:11:59.625363", "step": 1494, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:11:59.678610", "step": 1494, "epoch": 2 }, { "type": "loss", "content": 0.020908983424305916, "timestamp": "2025-09-30 22:11:59.681901", "step": 1495, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:59.737652", "step": 1495, "epoch": 2 }, { "type": "loss", "content": 0.007830632850527763, "timestamp": "2025-09-30 22:11:59.743947", "step": 1496, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:59.797627", "step": 1496, "epoch": 2 }, { "type": "loss", "content": 0.007764595095068216, "timestamp": "2025-09-30 22:11:59.800420", "step": 1497, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:59.855257", "step": 1497, "epoch": 2 }, { "type": "loss", "content": 0.01050377357751131, "timestamp": "2025-09-30 22:11:59.857838", "step": 1498, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:11:59.913101", "step": 1498, "epoch": 2 }, { "type": "loss", "content": 0.008915500715374947, "timestamp": "2025-09-30 22:11:59.915703", "step": 1499, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:11:59.971399", "step": 1499, "epoch": 2 }, { "type": "loss", "content": 0.015954632312059402, "timestamp": "2025-09-30 22:11:59.977693", "step": 1500, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1500", "timestamp": "2025-09-30 22:12:00.388871", "step": 1500, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:00.447231", "step": 1500, "epoch": 2 }, { "type": "loss", "content": 0.011480267159640789, "timestamp": "2025-09-30 22:12:00.449372", "step": 1501, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:00.503242", "step": 1501, "epoch": 2 }, { "type": "loss", "content": 0.010660061612725258, "timestamp": "2025-09-30 22:12:00.505412", "step": 1502, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:00.559660", "step": 1502, "epoch": 2 }, { "type": "loss", "content": 0.00754969660192728, "timestamp": "2025-09-30 22:12:00.562034", "step": 1503, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:00.619455", "step": 1503, "epoch": 2 }, { "type": "loss", "content": 0.006606790237128735, "timestamp": "2025-09-30 22:12:00.625333", "step": 1504, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:00.684398", "step": 1504, "epoch": 2 }, { "type": "loss", "content": 0.024877065792679787, "timestamp": "2025-09-30 22:12:00.686471", "step": 1505, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:00.740071", "step": 1505, "epoch": 2 }, { "type": "loss", "content": 0.018199782818555832, "timestamp": "2025-09-30 22:12:00.742556", "step": 1506, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:00.798637", "step": 1506, "epoch": 2 }, { "type": "loss", "content": 0.024665992707014084, "timestamp": "2025-09-30 22:12:00.800950", "step": 1507, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-30 22:12:00.860320", "step": 1507, "epoch": 2 }, { "type": "loss", "content": 0.010990189388394356, "timestamp": "2025-09-30 22:12:00.866243", "step": 1508, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:00.919912", "step": 1508, "epoch": 2 }, { "type": "loss", "content": 0.011042303405702114, "timestamp": "2025-09-30 22:12:00.922231", "step": 1509, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:00.976152", "step": 1509, "epoch": 2 }, { "type": "loss", "content": 0.016723833978176117, "timestamp": "2025-09-30 22:12:00.978834", "step": 1510, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:01.033142", "step": 1510, "epoch": 2 }, { "type": "loss", "content": 0.0033279487397521734, "timestamp": "2025-09-30 22:12:01.035168", "step": 1511, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:01.088589", "step": 1511, "epoch": 2 }, { "type": "loss", "content": 0.012400150299072266, "timestamp": "2025-09-30 22:12:01.094521", "step": 1512, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:01.150517", "step": 1512, "epoch": 2 }, { "type": "loss", "content": 0.011185402050614357, "timestamp": "2025-09-30 22:12:01.153351", "step": 1513, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:01.207943", "step": 1513, "epoch": 2 }, { "type": "loss", "content": 0.01125926524400711, "timestamp": "2025-09-30 22:12:01.210391", "step": 1514, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:01.265310", "step": 1514, "epoch": 2 }, { "type": "loss", "content": 0.009529463946819305, "timestamp": "2025-09-30 22:12:01.267633", "step": 1515, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:01.329492", "step": 1515, "epoch": 2 }, { "type": "loss", "content": 0.03866703063249588, "timestamp": "2025-09-30 22:12:01.335633", "step": 1516, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:01.393980", "step": 1516, "epoch": 2 }, { "type": "loss", "content": 0.009177983738481998, "timestamp": "2025-09-30 22:12:01.396395", "step": 1517, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:01.453397", "step": 1517, "epoch": 2 }, { "type": "loss", "content": 0.02175569161772728, "timestamp": "2025-09-30 22:12:01.455615", "step": 1518, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:01.508806", "step": 1518, "epoch": 2 }, { "type": "loss", "content": 0.012183894403278828, "timestamp": "2025-09-30 22:12:01.510982", "step": 1519, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:01.566324", "step": 1519, "epoch": 2 }, { "type": "loss", "content": 0.008869107812643051, "timestamp": "2025-09-30 22:12:01.574362", "step": 1520, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:01.628660", "step": 1520, "epoch": 2 }, { "type": "loss", "content": 0.021661145612597466, "timestamp": "2025-09-30 22:12:01.631165", "step": 1521, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:01.691533", "step": 1521, "epoch": 2 }, { "type": "loss", "content": 0.019203439354896545, "timestamp": "2025-09-30 22:12:01.695020", "step": 1522, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:01.749280", "step": 1522, "epoch": 2 }, { "type": "loss", "content": 0.010725765489041805, "timestamp": "2025-09-30 22:12:01.751519", "step": 1523, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:01.806614", "step": 1523, "epoch": 2 }, { "type": "loss", "content": 0.009876924566924572, "timestamp": "2025-09-30 22:12:01.812145", "step": 1524, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:01.867986", "step": 1524, "epoch": 2 }, { "type": "loss", "content": 0.011706030927598476, "timestamp": "2025-09-30 22:12:01.870018", "step": 1525, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:01.923199", "step": 1525, "epoch": 2 }, { "type": "loss", "content": 0.011448273435235023, "timestamp": "2025-09-30 22:12:01.925038", "step": 1526, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:01.977941", "step": 1526, "epoch": 2 }, { "type": "loss", "content": 0.009272503666579723, "timestamp": "2025-09-30 22:12:01.979995", "step": 1527, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:02.033727", "step": 1527, "epoch": 2 }, { "type": "loss", "content": 0.05378647893667221, "timestamp": "2025-09-30 22:12:02.039275", "step": 1528, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:02.091647", "step": 1528, "epoch": 2 }, { "type": "loss", "content": 0.024520421400666237, "timestamp": "2025-09-30 22:12:02.093742", "step": 1529, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:02.146154", "step": 1529, "epoch": 2 }, { "type": "loss", "content": 0.024694928899407387, "timestamp": "2025-09-30 22:12:02.148676", "step": 1530, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:02.203002", "step": 1530, "epoch": 2 }, { "type": "loss", "content": 0.007078372407704592, "timestamp": "2025-09-30 22:12:02.205450", "step": 1531, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:02.259443", "step": 1531, "epoch": 2 }, { "type": "loss", "content": 0.030642852187156677, "timestamp": "2025-09-30 22:12:02.265928", "step": 1532, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:02.319607", "step": 1532, "epoch": 2 }, { "type": "loss", "content": 0.016413046047091484, "timestamp": "2025-09-30 22:12:02.321885", "step": 1533, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:02.377357", "step": 1533, "epoch": 2 }, { "type": "loss", "content": 0.026809586212038994, "timestamp": "2025-09-30 22:12:02.379738", "step": 1534, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:02.434324", "step": 1534, "epoch": 2 }, { "type": "loss", "content": 0.02859729900956154, "timestamp": "2025-09-30 22:12:02.437108", "step": 1535, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:02.492084", "step": 1535, "epoch": 2 }, { "type": "loss", "content": 0.013830293901264668, "timestamp": "2025-09-30 22:12:02.497710", "step": 1536, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:02.550441", "step": 1536, "epoch": 2 }, { "type": "loss", "content": 0.01072558481246233, "timestamp": "2025-09-30 22:12:02.553119", "step": 1537, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:02.612546", "step": 1537, "epoch": 2 }, { "type": "loss", "content": 0.03165407106280327, "timestamp": "2025-09-30 22:12:02.614540", "step": 1538, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:02.669484", "step": 1538, "epoch": 2 }, { "type": "loss", "content": 0.006038249935954809, "timestamp": "2025-09-30 22:12:02.671470", "step": 1539, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:12:03.885059", "step": 1539, "epoch": 2 }, { "type": "pplx", "content": 35072222.03899579, "timestamp": "2025-09-30 22:12:03.886596", "step": 1539, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:03.938357", "step": 1539, "epoch": 2 }, { "type": "loss", "content": 0.019516736268997192, "timestamp": "2025-09-30 22:12:03.944029", "step": 1540, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:03.997583", "step": 1540, "epoch": 2 }, { "type": "loss", "content": 0.01919463463127613, "timestamp": "2025-09-30 22:12:03.999837", "step": 1541, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:04.053989", "step": 1541, "epoch": 2 }, { "type": "loss", "content": 0.02023499831557274, "timestamp": "2025-09-30 22:12:04.057050", "step": 1542, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:04.109535", "step": 1542, "epoch": 2 }, { "type": "loss", "content": 0.0022135619074106216, "timestamp": "2025-09-30 22:12:04.111673", "step": 1543, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:04.164050", "step": 1543, "epoch": 2 }, { "type": "loss", "content": 0.008789089508354664, "timestamp": "2025-09-30 22:12:04.170203", "step": 1544, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:04.224306", "step": 1544, "epoch": 2 }, { "type": "loss", "content": 0.013264109380543232, "timestamp": "2025-09-30 22:12:04.231236", "step": 1545, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:04.286705", "step": 1545, "epoch": 2 }, { "type": "loss", "content": 0.035324085503816605, "timestamp": "2025-09-30 22:12:04.291376", "step": 1546, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:04.347304", "step": 1546, "epoch": 2 }, { "type": "loss", "content": 0.028284629806876183, "timestamp": "2025-09-30 22:12:04.349678", "step": 1547, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:04.405061", "step": 1547, "epoch": 2 }, { "type": "loss", "content": 0.007114126812666655, "timestamp": "2025-09-30 22:12:04.410655", "step": 1548, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:04.464055", "step": 1548, "epoch": 2 }, { "type": "loss", "content": 0.0037361420691013336, "timestamp": "2025-09-30 22:12:04.466154", "step": 1549, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:04.519128", "step": 1549, "epoch": 2 }, { "type": "loss", "content": 0.00416518421843648, "timestamp": "2025-09-30 22:12:04.521220", "step": 1550, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:04.574047", "step": 1550, "epoch": 2 }, { "type": "loss", "content": 0.012675133533775806, "timestamp": "2025-09-30 22:12:04.576126", "step": 1551, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:04.629683", "step": 1551, "epoch": 2 }, { "type": "loss", "content": 0.018537061288952827, "timestamp": "2025-09-30 22:12:04.635243", "step": 1552, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:04.688065", "step": 1552, "epoch": 2 }, { "type": "loss", "content": 0.010581501759588718, "timestamp": "2025-09-30 22:12:04.690083", "step": 1553, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:04.743054", "step": 1553, "epoch": 2 }, { "type": "loss", "content": 0.0007142522372305393, "timestamp": "2025-09-30 22:12:04.745197", "step": 1554, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:04.798016", "step": 1554, "epoch": 2 }, { "type": "loss", "content": 0.03923966363072395, "timestamp": "2025-09-30 22:12:04.800205", "step": 1555, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:04.854036", "step": 1555, "epoch": 2 }, { "type": "loss", "content": 0.02381220832467079, "timestamp": "2025-09-30 22:12:04.859829", "step": 1556, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:04.912029", "step": 1556, "epoch": 2 }, { "type": "loss", "content": 0.002656190888956189, "timestamp": "2025-09-30 22:12:04.914059", "step": 1557, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:04.969018", "step": 1557, "epoch": 2 }, { "type": "loss", "content": 0.009249404072761536, "timestamp": "2025-09-30 22:12:04.971667", "step": 1558, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:05.026246", "step": 1558, "epoch": 2 }, { "type": "loss", "content": 0.013429306447505951, "timestamp": "2025-09-30 22:12:05.028394", "step": 1559, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:05.081243", "step": 1559, "epoch": 2 }, { "type": "loss", "content": 0.0022556984331458807, "timestamp": "2025-09-30 22:12:05.086891", "step": 1560, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:05.139568", "step": 1560, "epoch": 2 }, { "type": "loss", "content": 0.039768513292074203, "timestamp": "2025-09-30 22:12:05.141838", "step": 1561, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:05.194858", "step": 1561, "epoch": 2 }, { "type": "loss", "content": 0.002185255754739046, "timestamp": "2025-09-30 22:12:05.197796", "step": 1562, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:05.250587", "step": 1562, "epoch": 2 }, { "type": "loss", "content": 0.03695518150925636, "timestamp": "2025-09-30 22:12:05.253676", "step": 1563, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:05.309805", "step": 1563, "epoch": 2 }, { "type": "loss", "content": 0.006958664394915104, "timestamp": "2025-09-30 22:12:05.315287", "step": 1564, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:05.367430", "step": 1564, "epoch": 2 }, { "type": "loss", "content": 0.001130191725678742, "timestamp": "2025-09-30 22:12:05.370117", "step": 1565, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:05.423414", "step": 1565, "epoch": 2 }, { "type": "loss", "content": 0.0016752462834119797, "timestamp": "2025-09-30 22:12:05.425530", "step": 1566, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:05.477789", "step": 1566, "epoch": 2 }, { "type": "loss", "content": 0.0018236959585919976, "timestamp": "2025-09-30 22:12:05.480120", "step": 1567, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:05.532586", "step": 1567, "epoch": 2 }, { "type": "loss", "content": 0.022834938019514084, "timestamp": "2025-09-30 22:12:05.538287", "step": 1568, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:05.590319", "step": 1568, "epoch": 2 }, { "type": "loss", "content": 0.006333382334560156, "timestamp": "2025-09-30 22:12:05.592495", "step": 1569, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:05.644948", "step": 1569, "epoch": 2 }, { "type": "loss", "content": 0.012706826440989971, "timestamp": "2025-09-30 22:12:05.647109", "step": 1570, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:05.699654", "step": 1570, "epoch": 2 }, { "type": "loss", "content": 0.02377459593117237, "timestamp": "2025-09-30 22:12:05.701745", "step": 1571, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:05.754334", "step": 1571, "epoch": 2 }, { "type": "loss", "content": 0.011410296894609928, "timestamp": "2025-09-30 22:12:05.760192", "step": 1572, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:05.815069", "step": 1572, "epoch": 2 }, { "type": "loss", "content": 0.004736430011689663, "timestamp": "2025-09-30 22:12:05.817268", "step": 1573, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:05.869880", "step": 1573, "epoch": 2 }, { "type": "loss", "content": 0.001076001557521522, "timestamp": "2025-09-30 22:12:05.872137", "step": 1574, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:05.925587", "step": 1574, "epoch": 2 }, { "type": "loss", "content": 0.023327454924583435, "timestamp": "2025-09-30 22:12:05.927859", "step": 1575, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:05.981165", "step": 1575, "epoch": 2 }, { "type": "loss", "content": 0.0017735332949087024, "timestamp": "2025-09-30 22:12:05.987084", "step": 1576, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:06.039957", "step": 1576, "epoch": 2 }, { "type": "loss", "content": 0.010920924134552479, "timestamp": "2025-09-30 22:12:06.042225", "step": 1577, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:06.096214", "step": 1577, "epoch": 2 }, { "type": "loss", "content": 0.028373869135975838, "timestamp": "2025-09-30 22:12:06.098273", "step": 1578, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:06.152111", "step": 1578, "epoch": 2 }, { "type": "loss", "content": 0.0025909452233463526, "timestamp": "2025-09-30 22:12:06.154864", "step": 1579, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:06.209637", "step": 1579, "epoch": 2 }, { "type": "loss", "content": 0.01281531248241663, "timestamp": "2025-09-30 22:12:06.216014", "step": 1580, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:06.269328", "step": 1580, "epoch": 2 }, { "type": "loss", "content": 0.01906949281692505, "timestamp": "2025-09-30 22:12:06.272333", "step": 1581, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:06.326030", "step": 1581, "epoch": 2 }, { "type": "loss", "content": 0.008657933212816715, "timestamp": "2025-09-30 22:12:06.328046", "step": 1582, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:06.380427", "step": 1582, "epoch": 2 }, { "type": "loss", "content": 0.011052996851503849, "timestamp": "2025-09-30 22:12:06.382745", "step": 1583, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:06.436203", "step": 1583, "epoch": 2 }, { "type": "loss", "content": 0.008702821098268032, "timestamp": "2025-09-30 22:12:06.441965", "step": 1584, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:06.493780", "step": 1584, "epoch": 2 }, { "type": "loss", "content": 0.01199591439217329, "timestamp": "2025-09-30 22:12:06.495960", "step": 1585, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:06.548081", "step": 1585, "epoch": 2 }, { "type": "loss", "content": 0.02052219584584236, "timestamp": "2025-09-30 22:12:06.550557", "step": 1586, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:06.603416", "step": 1586, "epoch": 2 }, { "type": "loss", "content": 0.029241319745779037, "timestamp": "2025-09-30 22:12:06.605525", "step": 1587, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:06.657839", "step": 1587, "epoch": 2 }, { "type": "loss", "content": 0.004892691969871521, "timestamp": "2025-09-30 22:12:06.663407", "step": 1588, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:06.727359", "step": 1588, "epoch": 2 }, { "type": "loss", "content": 0.021011924371123314, "timestamp": "2025-09-30 22:12:06.729566", "step": 1589, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:06.782007", "step": 1589, "epoch": 2 }, { "type": "loss", "content": 0.01708308607339859, "timestamp": "2025-09-30 22:12:06.784478", "step": 1590, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:06.837235", "step": 1590, "epoch": 2 }, { "type": "loss", "content": 0.035234589129686356, "timestamp": "2025-09-30 22:12:06.839857", "step": 1591, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:06.892589", "step": 1591, "epoch": 2 }, { "type": "loss", "content": 0.047162704169750214, "timestamp": "2025-09-30 22:12:06.898404", "step": 1592, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:06.950346", "step": 1592, "epoch": 2 }, { "type": "loss", "content": 0.010256440378725529, "timestamp": "2025-09-30 22:12:06.952596", "step": 1593, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:07.005219", "step": 1593, "epoch": 2 }, { "type": "loss", "content": 0.011917630210518837, "timestamp": "2025-09-30 22:12:07.007233", "step": 1594, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:07.062589", "step": 1594, "epoch": 2 }, { "type": "loss", "content": 0.019979296252131462, "timestamp": "2025-09-30 22:12:07.064654", "step": 1595, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:07.117139", "step": 1595, "epoch": 2 }, { "type": "loss", "content": 0.020958244800567627, "timestamp": "2025-09-30 22:12:07.122974", "step": 1596, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:12:08.315114", "step": 1596, "epoch": 2 }, { "type": "pplx", "content": 33021765.354655504, "timestamp": "2025-09-30 22:12:08.317184", "step": 1596, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:08.368251", "step": 1596, "epoch": 2 }, { "type": "loss", "content": 0.00980797503143549, "timestamp": "2025-09-30 22:12:08.370313", "step": 1597, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:08.433892", "step": 1597, "epoch": 2 }, { "type": "loss", "content": 0.009293398819863796, "timestamp": "2025-09-30 22:12:08.435957", "step": 1598, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:08.488754", "step": 1598, "epoch": 2 }, { "type": "loss", "content": 0.015030805952847004, "timestamp": "2025-09-30 22:12:08.491114", "step": 1599, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:08.549008", "step": 1599, "epoch": 2 }, { "type": "loss", "content": 0.011158740147948265, "timestamp": "2025-09-30 22:12:08.554545", "step": 1600, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:08.610775", "step": 1600, "epoch": 2 }, { "type": "loss", "content": 0.005423234310001135, "timestamp": "2025-09-30 22:12:08.612759", "step": 1601, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:08.666779", "step": 1601, "epoch": 2 }, { "type": "loss", "content": 0.02317948266863823, "timestamp": "2025-09-30 22:12:08.668958", "step": 1602, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:08.722140", "step": 1602, "epoch": 2 }, { "type": "loss", "content": 0.01775677688419819, "timestamp": "2025-09-30 22:12:08.724860", "step": 1603, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:08.791960", "step": 1603, "epoch": 2 }, { "type": "loss", "content": 0.0035264005418866873, "timestamp": "2025-09-30 22:12:08.798017", "step": 1604, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:08.851106", "step": 1604, "epoch": 2 }, { "type": "loss", "content": 0.006642151158303022, "timestamp": "2025-09-30 22:12:08.853289", "step": 1605, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:08.906804", "step": 1605, "epoch": 2 }, { "type": "loss", "content": 0.026895778253674507, "timestamp": "2025-09-30 22:12:08.909344", "step": 1606, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:08.964905", "step": 1606, "epoch": 2 }, { "type": "loss", "content": 0.008761496283113956, "timestamp": "2025-09-30 22:12:08.967384", "step": 1607, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:09.020829", "step": 1607, "epoch": 2 }, { "type": "loss", "content": 0.011370251886546612, "timestamp": "2025-09-30 22:12:09.026859", "step": 1608, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:09.079870", "step": 1608, "epoch": 2 }, { "type": "loss", "content": 0.007803045213222504, "timestamp": "2025-09-30 22:12:09.081910", "step": 1609, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:09.135803", "step": 1609, "epoch": 2 }, { "type": "loss", "content": 0.02594866044819355, "timestamp": "2025-09-30 22:12:09.138287", "step": 1610, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:09.191233", "step": 1610, "epoch": 2 }, { "type": "loss", "content": 0.03655938431620598, "timestamp": "2025-09-30 22:12:09.194673", "step": 1611, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:09.250578", "step": 1611, "epoch": 2 }, { "type": "loss", "content": 0.005339773837476969, "timestamp": "2025-09-30 22:12:09.258772", "step": 1612, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:09.313826", "step": 1612, "epoch": 2 }, { "type": "loss", "content": 0.013894207775592804, "timestamp": "2025-09-30 22:12:09.316505", "step": 1613, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:09.370619", "step": 1613, "epoch": 2 }, { "type": "loss", "content": 0.005706585478037596, "timestamp": "2025-09-30 22:12:09.373401", "step": 1614, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:09.427451", "step": 1614, "epoch": 2 }, { "type": "loss", "content": 0.005220034625381231, "timestamp": "2025-09-30 22:12:09.430066", "step": 1615, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:09.483919", "step": 1615, "epoch": 2 }, { "type": "loss", "content": 0.02249130979180336, "timestamp": "2025-09-30 22:12:09.490894", "step": 1616, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:09.545619", "step": 1616, "epoch": 2 }, { "type": "loss", "content": 0.024420803412795067, "timestamp": "2025-09-30 22:12:09.548516", "step": 1617, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:09.601846", "step": 1617, "epoch": 2 }, { "type": "loss", "content": 0.006213244050741196, "timestamp": "2025-09-30 22:12:09.604007", "step": 1618, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:09.657294", "step": 1618, "epoch": 2 }, { "type": "loss", "content": 0.010518096387386322, "timestamp": "2025-09-30 22:12:09.660390", "step": 1619, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:09.715056", "step": 1619, "epoch": 2 }, { "type": "loss", "content": 0.016626330092549324, "timestamp": "2025-09-30 22:12:09.721021", "step": 1620, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:09.778310", "step": 1620, "epoch": 2 }, { "type": "loss", "content": 0.010907611809670925, "timestamp": "2025-09-30 22:12:09.780560", "step": 1621, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:09.835444", "step": 1621, "epoch": 2 }, { "type": "loss", "content": 0.010932761244475842, "timestamp": "2025-09-30 22:12:09.838341", "step": 1622, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:09.891989", "step": 1622, "epoch": 2 }, { "type": "loss", "content": 0.002364618005231023, "timestamp": "2025-09-30 22:12:09.894614", "step": 1623, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:09.949021", "step": 1623, "epoch": 2 }, { "type": "loss", "content": 0.0025081944186240435, "timestamp": "2025-09-30 22:12:09.955528", "step": 1624, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:10.012894", "step": 1624, "epoch": 2 }, { "type": "loss", "content": 0.011359497904777527, "timestamp": "2025-09-30 22:12:10.015406", "step": 1625, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:10.068841", "step": 1625, "epoch": 2 }, { "type": "loss", "content": 0.0060988375917077065, "timestamp": "2025-09-30 22:12:10.071461", "step": 1626, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:10.127759", "step": 1626, "epoch": 2 }, { "type": "loss", "content": 0.05257219076156616, "timestamp": "2025-09-30 22:12:10.130388", "step": 1627, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:12:10.185216", "step": 1627, "epoch": 2 }, { "type": "loss", "content": 0.012155899778008461, "timestamp": "2025-09-30 22:12:10.192809", "step": 1628, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:10.248137", "step": 1628, "epoch": 2 }, { "type": "loss", "content": 0.01888015680015087, "timestamp": "2025-09-30 22:12:10.251948", "step": 1629, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:10.310545", "step": 1629, "epoch": 2 }, { "type": "loss", "content": 0.011508808471262455, "timestamp": "2025-09-30 22:12:10.314123", "step": 1630, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:10.369979", "step": 1630, "epoch": 2 }, { "type": "loss", "content": 0.009577545337378979, "timestamp": "2025-09-30 22:12:10.372814", "step": 1631, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:10.427544", "step": 1631, "epoch": 2 }, { "type": "loss", "content": 0.022420035675168037, "timestamp": "2025-09-30 22:12:10.433549", "step": 1632, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:10.487384", "step": 1632, "epoch": 2 }, { "type": "loss", "content": 0.013017824850976467, "timestamp": "2025-09-30 22:12:10.490027", "step": 1633, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:10.543592", "step": 1633, "epoch": 2 }, { "type": "loss", "content": 0.006547243800014257, "timestamp": "2025-09-30 22:12:10.545311", "step": 1634, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:10.603149", "step": 1634, "epoch": 2 }, { "type": "loss", "content": 0.016644669696688652, "timestamp": "2025-09-30 22:12:10.604931", "step": 1635, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:10.660831", "step": 1635, "epoch": 2 }, { "type": "loss", "content": 0.009076499380171299, "timestamp": "2025-09-30 22:12:10.666106", "step": 1636, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:10.719634", "step": 1636, "epoch": 2 }, { "type": "loss", "content": 0.0293984804302454, "timestamp": "2025-09-30 22:12:10.721835", "step": 1637, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:10.776206", "step": 1637, "epoch": 2 }, { "type": "loss", "content": 0.001451183925382793, "timestamp": "2025-09-30 22:12:10.778315", "step": 1638, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:10.832230", "step": 1638, "epoch": 2 }, { "type": "loss", "content": 0.002988268854096532, "timestamp": "2025-09-30 22:12:10.834016", "step": 1639, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:10.888355", "step": 1639, "epoch": 2 }, { "type": "loss", "content": 0.007970958016812801, "timestamp": "2025-09-30 22:12:10.893965", "step": 1640, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:10.947408", "step": 1640, "epoch": 2 }, { "type": "loss", "content": 0.026539817452430725, "timestamp": "2025-09-30 22:12:10.949401", "step": 1641, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:11.003469", "step": 1641, "epoch": 2 }, { "type": "loss", "content": 0.017281271517276764, "timestamp": "2025-09-30 22:12:11.005352", "step": 1642, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:11.067465", "step": 1642, "epoch": 2 }, { "type": "loss", "content": 0.008392428047955036, "timestamp": "2025-09-30 22:12:11.069643", "step": 1643, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:11.125032", "step": 1643, "epoch": 2 }, { "type": "loss", "content": 0.01337872352451086, "timestamp": "2025-09-30 22:12:11.130293", "step": 1644, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:11.183409", "step": 1644, "epoch": 2 }, { "type": "loss", "content": 0.005623048637062311, "timestamp": "2025-09-30 22:12:11.186127", "step": 1645, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:11.241311", "step": 1645, "epoch": 2 }, { "type": "loss", "content": 0.009559988044202328, "timestamp": "2025-09-30 22:12:11.243866", "step": 1646, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:11.296841", "step": 1646, "epoch": 2 }, { "type": "loss", "content": 0.010462108068168163, "timestamp": "2025-09-30 22:12:11.299819", "step": 1647, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:11.354919", "step": 1647, "epoch": 2 }, { "type": "loss", "content": 0.008628972806036472, "timestamp": "2025-09-30 22:12:11.360579", "step": 1648, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:11.413579", "step": 1648, "epoch": 2 }, { "type": "loss", "content": 0.017333444207906723, "timestamp": "2025-09-30 22:12:11.415784", "step": 1649, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:11.473420", "step": 1649, "epoch": 2 }, { "type": "loss", "content": 0.007165636867284775, "timestamp": "2025-09-30 22:12:11.475820", "step": 1650, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:11.529599", "step": 1650, "epoch": 2 }, { "type": "loss", "content": 0.005024661775678396, "timestamp": "2025-09-30 22:12:11.531962", "step": 1651, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:11.586225", "step": 1651, "epoch": 2 }, { "type": "loss", "content": 0.003687667427584529, "timestamp": "2025-09-30 22:12:11.591545", "step": 1652, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:11.644033", "step": 1652, "epoch": 2 }, { "type": "loss", "content": 0.01683473400771618, "timestamp": "2025-09-30 22:12:11.645842", "step": 1653, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:12:12.861767", "step": 1653, "epoch": 2 }, { "type": "pplx", "content": 31819411.36838668, "timestamp": "2025-09-30 22:12:12.875645", "step": 1653, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:12.927653", "step": 1653, "epoch": 2 }, { "type": "loss", "content": 0.019419243559241295, "timestamp": "2025-09-30 22:12:12.929830", "step": 1654, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:12.982438", "step": 1654, "epoch": 2 }, { "type": "loss", "content": 0.004655472934246063, "timestamp": "2025-09-30 22:12:12.984630", "step": 1655, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:13.038205", "step": 1655, "epoch": 2 }, { "type": "loss", "content": 0.0069176494143903255, "timestamp": "2025-09-30 22:12:13.044005", "step": 1656, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:13.096761", "step": 1656, "epoch": 2 }, { "type": "loss", "content": 0.0254069771617651, "timestamp": "2025-09-30 22:12:13.098612", "step": 1657, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:13.151554", "step": 1657, "epoch": 2 }, { "type": "loss", "content": 0.02898487262427807, "timestamp": "2025-09-30 22:12:13.153772", "step": 1658, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:13.207387", "step": 1658, "epoch": 2 }, { "type": "loss", "content": 0.010060988366603851, "timestamp": "2025-09-30 22:12:13.210551", "step": 1659, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:13.264259", "step": 1659, "epoch": 2 }, { "type": "loss", "content": 0.004005379509180784, "timestamp": "2025-09-30 22:12:13.272363", "step": 1660, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:13.334659", "step": 1660, "epoch": 2 }, { "type": "loss", "content": 0.043611329048871994, "timestamp": "2025-09-30 22:12:13.337118", "step": 1661, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:13.389890", "step": 1661, "epoch": 2 }, { "type": "loss", "content": 0.007646446116268635, "timestamp": "2025-09-30 22:12:13.391929", "step": 1662, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:13.445571", "step": 1662, "epoch": 2 }, { "type": "loss", "content": 0.008269752375781536, "timestamp": "2025-09-30 22:12:13.447687", "step": 1663, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:13.503140", "step": 1663, "epoch": 2 }, { "type": "loss", "content": 0.003249566303566098, "timestamp": "2025-09-30 22:12:13.508527", "step": 1664, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:13.561133", "step": 1664, "epoch": 2 }, { "type": "loss", "content": 0.010090269148349762, "timestamp": "2025-09-30 22:12:13.562734", "step": 1665, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:13.615328", "step": 1665, "epoch": 2 }, { "type": "loss", "content": 0.005579036194831133, "timestamp": "2025-09-30 22:12:13.616959", "step": 1666, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:13.670719", "step": 1666, "epoch": 2 }, { "type": "loss", "content": 0.023332465440034866, "timestamp": "2025-09-30 22:12:13.672956", "step": 1667, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:13.726365", "step": 1667, "epoch": 2 }, { "type": "loss", "content": 0.03554206341505051, "timestamp": "2025-09-30 22:12:13.732007", "step": 1668, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:13.785078", "step": 1668, "epoch": 2 }, { "type": "loss", "content": 0.059974852949380875, "timestamp": "2025-09-30 22:12:13.787293", "step": 1669, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:13.840026", "step": 1669, "epoch": 2 }, { "type": "loss", "content": 0.00815680343657732, "timestamp": "2025-09-30 22:12:13.842053", "step": 1670, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:13.897501", "step": 1670, "epoch": 2 }, { "type": "loss", "content": 0.013160581700503826, "timestamp": "2025-09-30 22:12:13.899928", "step": 1671, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:13.953678", "step": 1671, "epoch": 2 }, { "type": "loss", "content": 0.011886881664395332, "timestamp": "2025-09-30 22:12:13.959315", "step": 1672, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:14.016600", "step": 1672, "epoch": 2 }, { "type": "loss", "content": 0.011096789501607418, "timestamp": "2025-09-30 22:12:14.018598", "step": 1673, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:14.072022", "step": 1673, "epoch": 2 }, { "type": "loss", "content": 0.002368275774642825, "timestamp": "2025-09-30 22:12:14.074112", "step": 1674, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:14.128129", "step": 1674, "epoch": 2 }, { "type": "loss", "content": 0.00044797913869842887, "timestamp": "2025-09-30 22:12:14.130228", "step": 1675, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:14.184419", "step": 1675, "epoch": 2 }, { "type": "loss", "content": 0.0028583400417119265, "timestamp": "2025-09-30 22:12:14.190607", "step": 1676, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:14.244200", "step": 1676, "epoch": 2 }, { "type": "loss", "content": 0.008487415499985218, "timestamp": "2025-09-30 22:12:14.246938", "step": 1677, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:14.299946", "step": 1677, "epoch": 2 }, { "type": "loss", "content": 0.00515100359916687, "timestamp": "2025-09-30 22:12:14.302158", "step": 1678, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:12:14.358300", "step": 1678, "epoch": 2 }, { "type": "loss", "content": 0.0111812399700284, "timestamp": "2025-09-30 22:12:14.360428", "step": 1679, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:14.413857", "step": 1679, "epoch": 2 }, { "type": "loss", "content": 0.018307924270629883, "timestamp": "2025-09-30 22:12:14.419308", "step": 1680, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:14.474616", "step": 1680, "epoch": 2 }, { "type": "loss", "content": 0.005157233215868473, "timestamp": "2025-09-30 22:12:14.476856", "step": 1681, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:14.533033", "step": 1681, "epoch": 2 }, { "type": "loss", "content": 0.006048670504242182, "timestamp": "2025-09-30 22:12:14.535241", "step": 1682, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:14.589464", "step": 1682, "epoch": 2 }, { "type": "loss", "content": 0.005315546877682209, "timestamp": "2025-09-30 22:12:14.592376", "step": 1683, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:14.653180", "step": 1683, "epoch": 2 }, { "type": "loss", "content": 0.004861609544605017, "timestamp": "2025-09-30 22:12:14.658824", "step": 1684, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:14.711677", "step": 1684, "epoch": 2 }, { "type": "loss", "content": 0.03198011592030525, "timestamp": "2025-09-30 22:12:14.713847", "step": 1685, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:14.766204", "step": 1685, "epoch": 2 }, { "type": "loss", "content": 0.006582505535334349, "timestamp": "2025-09-30 22:12:14.768519", "step": 1686, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:14.824037", "step": 1686, "epoch": 2 }, { "type": "loss", "content": 0.041787777096033096, "timestamp": "2025-09-30 22:12:14.826140", "step": 1687, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:14.884293", "step": 1687, "epoch": 2 }, { "type": "loss", "content": 0.05907066911458969, "timestamp": "2025-09-30 22:12:14.890039", "step": 1688, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:14.944929", "step": 1688, "epoch": 2 }, { "type": "loss", "content": 0.001978454412892461, "timestamp": "2025-09-30 22:12:14.947025", "step": 1689, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:15.002116", "step": 1689, "epoch": 2 }, { "type": "loss", "content": 0.052878353744745255, "timestamp": "2025-09-30 22:12:15.004343", "step": 1690, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:15.059424", "step": 1690, "epoch": 2 }, { "type": "loss", "content": 0.0005954128573648632, "timestamp": "2025-09-30 22:12:15.061543", "step": 1691, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:15.114789", "step": 1691, "epoch": 2 }, { "type": "loss", "content": 0.02634171023964882, "timestamp": "2025-09-30 22:12:15.121913", "step": 1692, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:15.177756", "step": 1692, "epoch": 2 }, { "type": "loss", "content": 0.013122417032718658, "timestamp": "2025-09-30 22:12:15.179846", "step": 1693, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:15.234822", "step": 1693, "epoch": 2 }, { "type": "loss", "content": 0.008947016671299934, "timestamp": "2025-09-30 22:12:15.238000", "step": 1694, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:15.292395", "step": 1694, "epoch": 2 }, { "type": "loss", "content": 0.0031593344174325466, "timestamp": "2025-09-30 22:12:15.295329", "step": 1695, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:15.350920", "step": 1695, "epoch": 2 }, { "type": "loss", "content": 0.05288988724350929, "timestamp": "2025-09-30 22:12:15.356438", "step": 1696, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:15.411357", "step": 1696, "epoch": 2 }, { "type": "loss", "content": 0.005359127651900053, "timestamp": "2025-09-30 22:12:15.413335", "step": 1697, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:15.471260", "step": 1697, "epoch": 2 }, { "type": "loss", "content": 0.011889653280377388, "timestamp": "2025-09-30 22:12:15.473427", "step": 1698, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:15.532562", "step": 1698, "epoch": 2 }, { "type": "loss", "content": 0.0044033522717654705, "timestamp": "2025-09-30 22:12:15.534884", "step": 1699, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:15.598256", "step": 1699, "epoch": 2 }, { "type": "loss", "content": 0.016879115253686905, "timestamp": "2025-09-30 22:12:15.604579", "step": 1700, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:15.657992", "step": 1700, "epoch": 2 }, { "type": "loss", "content": 0.02566424012184143, "timestamp": "2025-09-30 22:12:15.660020", "step": 1701, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:15.713446", "step": 1701, "epoch": 2 }, { "type": "loss", "content": 0.0031795850954949856, "timestamp": "2025-09-30 22:12:15.715503", "step": 1702, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:15.769623", "step": 1702, "epoch": 2 }, { "type": "loss", "content": 0.003098678309470415, "timestamp": "2025-09-30 22:12:15.771959", "step": 1703, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:15.828493", "step": 1703, "epoch": 2 }, { "type": "loss", "content": 0.0025204757694154978, "timestamp": "2025-09-30 22:12:15.833993", "step": 1704, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:15.890623", "step": 1704, "epoch": 2 }, { "type": "loss", "content": 0.007822325453162193, "timestamp": "2025-09-30 22:12:15.892874", "step": 1705, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:15.945573", "step": 1705, "epoch": 2 }, { "type": "loss", "content": 0.017046038061380386, "timestamp": "2025-09-30 22:12:15.947579", "step": 1706, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:16.015154", "step": 1706, "epoch": 2 }, { "type": "loss", "content": 0.0030831003095954657, "timestamp": "2025-09-30 22:12:16.017231", "step": 1707, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:16.072731", "step": 1707, "epoch": 2 }, { "type": "loss", "content": 0.00840392243117094, "timestamp": "2025-09-30 22:12:16.078330", "step": 1708, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:16.130875", "step": 1708, "epoch": 2 }, { "type": "loss", "content": 0.011422915384173393, "timestamp": "2025-09-30 22:12:16.133009", "step": 1709, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:16.189790", "step": 1709, "epoch": 2 }, { "type": "loss", "content": 0.004428292624652386, "timestamp": "2025-09-30 22:12:16.191961", "step": 1710, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:12:17.490807", "step": 1710, "epoch": 2 }, { "type": "pplx", "content": 28349924.714862473, "timestamp": "2025-09-30 22:12:17.492915", "step": 1710, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:17.549103", "step": 1710, "epoch": 2 }, { "type": "loss", "content": 0.024888822808861732, "timestamp": "2025-09-30 22:12:17.552103", "step": 1711, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:17.607307", "step": 1711, "epoch": 2 }, { "type": "loss", "content": 0.022045819088816643, "timestamp": "2025-09-30 22:12:17.613677", "step": 1712, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:17.671421", "step": 1712, "epoch": 2 }, { "type": "loss", "content": 0.006401827093213797, "timestamp": "2025-09-30 22:12:17.673761", "step": 1713, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:17.729271", "step": 1713, "epoch": 2 }, { "type": "loss", "content": 0.007945166900753975, "timestamp": "2025-09-30 22:12:17.733538", "step": 1714, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:17.795084", "step": 1714, "epoch": 2 }, { "type": "loss", "content": 0.006832032930105925, "timestamp": "2025-09-30 22:12:17.797623", "step": 1715, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:17.854066", "step": 1715, "epoch": 2 }, { "type": "loss", "content": 0.010955681093037128, "timestamp": "2025-09-30 22:12:17.860042", "step": 1716, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:17.913039", "step": 1716, "epoch": 2 }, { "type": "loss", "content": 0.025570230558514595, "timestamp": "2025-09-30 22:12:17.917842", "step": 1717, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:17.972577", "step": 1717, "epoch": 2 }, { "type": "loss", "content": 0.013870848342776299, "timestamp": "2025-09-30 22:12:17.977653", "step": 1718, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:18.031960", "step": 1718, "epoch": 2 }, { "type": "loss", "content": 0.018312016502022743, "timestamp": "2025-09-30 22:12:18.034566", "step": 1719, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:18.098990", "step": 1719, "epoch": 2 }, { "type": "loss", "content": 0.016707230359315872, "timestamp": "2025-09-30 22:12:18.104901", "step": 1720, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:18.169517", "step": 1720, "epoch": 2 }, { "type": "loss", "content": 0.010144525207579136, "timestamp": "2025-09-30 22:12:18.171798", "step": 1721, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:18.233785", "step": 1721, "epoch": 2 }, { "type": "loss", "content": 0.011039652861654758, "timestamp": "2025-09-30 22:12:18.237053", "step": 1722, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:18.305230", "step": 1722, "epoch": 2 }, { "type": "loss", "content": 0.01270805113017559, "timestamp": "2025-09-30 22:12:18.313241", "step": 1723, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:18.368284", "step": 1723, "epoch": 2 }, { "type": "loss", "content": 0.00563334533944726, "timestamp": "2025-09-30 22:12:18.375460", "step": 1724, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:18.432695", "step": 1724, "epoch": 2 }, { "type": "loss", "content": 0.005631479900330305, "timestamp": "2025-09-30 22:12:18.440294", "step": 1725, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:18.497729", "step": 1725, "epoch": 2 }, { "type": "loss", "content": 0.009919635951519012, "timestamp": "2025-09-30 22:12:18.505164", "step": 1726, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:18.560365", "step": 1726, "epoch": 2 }, { "type": "loss", "content": 0.023495439440011978, "timestamp": "2025-09-30 22:12:18.564363", "step": 1727, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:18.620110", "step": 1727, "epoch": 2 }, { "type": "loss", "content": 0.008099724538624287, "timestamp": "2025-09-30 22:12:18.626786", "step": 1728, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:18.683795", "step": 1728, "epoch": 2 }, { "type": "loss", "content": 0.009610223583877087, "timestamp": "2025-09-30 22:12:18.697005", "step": 1729, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:18.753105", "step": 1729, "epoch": 2 }, { "type": "loss", "content": 0.009814736433327198, "timestamp": "2025-09-30 22:12:18.758237", "step": 1730, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:18.821455", "step": 1730, "epoch": 2 }, { "type": "loss", "content": 0.02346763201057911, "timestamp": "2025-09-30 22:12:18.835026", "step": 1731, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:18.902723", "step": 1731, "epoch": 2 }, { "type": "loss", "content": 0.0012323985574766994, "timestamp": "2025-09-30 22:12:18.909790", "step": 1732, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:18.965608", "step": 1732, "epoch": 2 }, { "type": "loss", "content": 0.010308587923645973, "timestamp": "2025-09-30 22:12:18.967963", "step": 1733, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:19.025664", "step": 1733, "epoch": 2 }, { "type": "loss", "content": 0.015463477931916714, "timestamp": "2025-09-30 22:12:19.028111", "step": 1734, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:19.082060", "step": 1734, "epoch": 2 }, { "type": "loss", "content": 0.013520815409719944, "timestamp": "2025-09-30 22:12:19.086364", "step": 1735, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:19.142929", "step": 1735, "epoch": 2 }, { "type": "loss", "content": 0.007343901786953211, "timestamp": "2025-09-30 22:12:19.149616", "step": 1736, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:19.207407", "step": 1736, "epoch": 2 }, { "type": "loss", "content": 0.012418312020599842, "timestamp": "2025-09-30 22:12:19.209506", "step": 1737, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:19.267575", "step": 1737, "epoch": 2 }, { "type": "loss", "content": 0.019974172115325928, "timestamp": "2025-09-30 22:12:19.270043", "step": 1738, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:19.329102", "step": 1738, "epoch": 2 }, { "type": "loss", "content": 0.005352269392460585, "timestamp": "2025-09-30 22:12:19.333119", "step": 1739, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:19.393917", "step": 1739, "epoch": 2 }, { "type": "loss", "content": 0.007415872532874346, "timestamp": "2025-09-30 22:12:19.404366", "step": 1740, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:19.465342", "step": 1740, "epoch": 2 }, { "type": "loss", "content": 0.020012961700558662, "timestamp": "2025-09-30 22:12:19.467678", "step": 1741, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:19.526733", "step": 1741, "epoch": 2 }, { "type": "loss", "content": 0.011243684217333794, "timestamp": "2025-09-30 22:12:19.528909", "step": 1742, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:19.599075", "step": 1742, "epoch": 2 }, { "type": "loss", "content": 0.023677440360188484, "timestamp": "2025-09-30 22:12:19.601646", "step": 1743, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:19.655231", "step": 1743, "epoch": 2 }, { "type": "loss", "content": 0.023243535310029984, "timestamp": "2025-09-30 22:12:19.661376", "step": 1744, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:19.730048", "step": 1744, "epoch": 2 }, { "type": "loss", "content": 0.009879850782454014, "timestamp": "2025-09-30 22:12:19.732022", "step": 1745, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:19.786102", "step": 1745, "epoch": 2 }, { "type": "loss", "content": 0.006921069230884314, "timestamp": "2025-09-30 22:12:19.788498", "step": 1746, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:19.853434", "step": 1746, "epoch": 2 }, { "type": "loss", "content": 0.004081842955201864, "timestamp": "2025-09-30 22:12:19.855443", "step": 1747, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:19.910691", "step": 1747, "epoch": 2 }, { "type": "loss", "content": 0.004515796434134245, "timestamp": "2025-09-30 22:12:19.916269", "step": 1748, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:19.973698", "step": 1748, "epoch": 2 }, { "type": "loss", "content": 0.005187832750380039, "timestamp": "2025-09-30 22:12:19.975788", "step": 1749, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:20.030734", "step": 1749, "epoch": 2 }, { "type": "loss", "content": 0.009365570731461048, "timestamp": "2025-09-30 22:12:20.036962", "step": 1750, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:20.099843", "step": 1750, "epoch": 2 }, { "type": "loss", "content": 0.014311355538666248, "timestamp": "2025-09-30 22:12:20.101958", "step": 1751, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:20.163735", "step": 1751, "epoch": 2 }, { "type": "loss", "content": 0.0038259513676166534, "timestamp": "2025-09-30 22:12:20.169460", "step": 1752, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:20.222660", "step": 1752, "epoch": 2 }, { "type": "loss", "content": 0.011391245760023594, "timestamp": "2025-09-30 22:12:20.224850", "step": 1753, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:20.282986", "step": 1753, "epoch": 2 }, { "type": "loss", "content": 0.03851420059800148, "timestamp": "2025-09-30 22:12:20.285579", "step": 1754, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:20.347897", "step": 1754, "epoch": 2 }, { "type": "loss", "content": 0.008675575256347656, "timestamp": "2025-09-30 22:12:20.349959", "step": 1755, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:20.405526", "step": 1755, "epoch": 2 }, { "type": "loss", "content": 0.016350556164979935, "timestamp": "2025-09-30 22:12:20.411169", "step": 1756, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:20.465248", "step": 1756, "epoch": 2 }, { "type": "loss", "content": 0.0020395752508193254, "timestamp": "2025-09-30 22:12:20.468495", "step": 1757, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:20.527627", "step": 1757, "epoch": 2 }, { "type": "loss", "content": 0.007115581072866917, "timestamp": "2025-09-30 22:12:20.530931", "step": 1758, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:20.588081", "step": 1758, "epoch": 2 }, { "type": "loss", "content": 0.008915326558053493, "timestamp": "2025-09-30 22:12:20.590168", "step": 1759, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:20.648323", "step": 1759, "epoch": 2 }, { "type": "loss", "content": 0.01385872345417738, "timestamp": "2025-09-30 22:12:20.653777", "step": 1760, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:20.710857", "step": 1760, "epoch": 2 }, { "type": "loss", "content": 0.019434064626693726, "timestamp": "2025-09-30 22:12:20.712841", "step": 1761, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:20.768129", "step": 1761, "epoch": 2 }, { "type": "loss", "content": 0.001426653703674674, "timestamp": "2025-09-30 22:12:20.770139", "step": 1762, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:20.826674", "step": 1762, "epoch": 2 }, { "type": "loss", "content": 0.000667016429360956, "timestamp": "2025-09-30 22:12:20.828901", "step": 1763, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:20.884903", "step": 1763, "epoch": 2 }, { "type": "loss", "content": 0.012331242673099041, "timestamp": "2025-09-30 22:12:20.891604", "step": 1764, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:20.946165", "step": 1764, "epoch": 2 }, { "type": "loss", "content": 0.008639157749712467, "timestamp": "2025-09-30 22:12:20.948392", "step": 1765, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:21.015661", "step": 1765, "epoch": 2 }, { "type": "loss", "content": 0.001175249577499926, "timestamp": "2025-09-30 22:12:21.017767", "step": 1766, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:21.074161", "step": 1766, "epoch": 2 }, { "type": "loss", "content": 0.0008622457389719784, "timestamp": "2025-09-30 22:12:21.076962", "step": 1767, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:12:22.508631", "step": 1767, "epoch": 2 }, { "type": "pplx", "content": 30314827.130945917, "timestamp": "2025-09-30 22:12:22.511215", "step": 1767, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:22.574951", "step": 1767, "epoch": 2 }, { "type": "loss", "content": 0.001509829773567617, "timestamp": "2025-09-30 22:12:22.581148", "step": 1768, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:22.653620", "step": 1768, "epoch": 2 }, { "type": "loss", "content": 0.011697669513523579, "timestamp": "2025-09-30 22:12:22.655679", "step": 1769, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:22.719592", "step": 1769, "epoch": 2 }, { "type": "loss", "content": 0.0038434225134551525, "timestamp": "2025-09-30 22:12:22.721767", "step": 1770, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:22.779999", "step": 1770, "epoch": 2 }, { "type": "loss", "content": 0.010291250422596931, "timestamp": "2025-09-30 22:12:22.782196", "step": 1771, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:22.855201", "step": 1771, "epoch": 2 }, { "type": "loss", "content": 0.00038557566585950553, "timestamp": "2025-09-30 22:12:22.861700", "step": 1772, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:22.925889", "step": 1772, "epoch": 2 }, { "type": "loss", "content": 0.002135960618034005, "timestamp": "2025-09-30 22:12:22.928233", "step": 1773, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:22.983098", "step": 1773, "epoch": 2 }, { "type": "loss", "content": 0.022244064137339592, "timestamp": "2025-09-30 22:12:22.990669", "step": 1774, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:23.047551", "step": 1774, "epoch": 2 }, { "type": "loss", "content": 0.021448533982038498, "timestamp": "2025-09-30 22:12:23.050681", "step": 1775, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:23.126710", "step": 1775, "epoch": 2 }, { "type": "loss", "content": 0.005585017614066601, "timestamp": "2025-09-30 22:12:23.132632", "step": 1776, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:23.209022", "step": 1776, "epoch": 2 }, { "type": "loss", "content": 0.0050450703129172325, "timestamp": "2025-09-30 22:12:23.214519", "step": 1777, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:23.281367", "step": 1777, "epoch": 2 }, { "type": "loss", "content": 0.006257961504161358, "timestamp": "2025-09-30 22:12:23.284211", "step": 1778, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:23.358645", "step": 1778, "epoch": 2 }, { "type": "loss", "content": 0.0036130433436483145, "timestamp": "2025-09-30 22:12:23.360773", "step": 1779, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:23.429914", "step": 1779, "epoch": 2 }, { "type": "loss", "content": 0.0057233721017837524, "timestamp": "2025-09-30 22:12:23.435998", "step": 1780, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:23.492874", "step": 1780, "epoch": 2 }, { "type": "loss", "content": 0.001503689563833177, "timestamp": "2025-09-30 22:12:23.495169", "step": 1781, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:23.551669", "step": 1781, "epoch": 2 }, { "type": "loss", "content": 0.01165227685123682, "timestamp": "2025-09-30 22:12:23.571755", "step": 1782, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:23.661283", "step": 1782, "epoch": 2 }, { "type": "loss", "content": 0.0008390057482756674, "timestamp": "2025-09-30 22:12:23.678023", "step": 1783, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:23.740549", "step": 1783, "epoch": 2 }, { "type": "loss", "content": 0.0012339332606643438, "timestamp": "2025-09-30 22:12:23.756039", "step": 1784, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:23.820029", "step": 1784, "epoch": 2 }, { "type": "loss", "content": 0.0013973814202472568, "timestamp": "2025-09-30 22:12:23.828115", "step": 1785, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:23.902989", "step": 1785, "epoch": 2 }, { "type": "loss", "content": 0.0011909554013982415, "timestamp": "2025-09-30 22:12:23.907395", "step": 1786, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:23.984498", "step": 1786, "epoch": 2 }, { "type": "loss", "content": 0.020810788497328758, "timestamp": "2025-09-30 22:12:23.989340", "step": 1787, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:24.047666", "step": 1787, "epoch": 2 }, { "type": "loss", "content": 0.003498237580060959, "timestamp": "2025-09-30 22:12:24.056749", "step": 1788, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:24.128102", "step": 1788, "epoch": 2 }, { "type": "loss", "content": 0.0002930622431449592, "timestamp": "2025-09-30 22:12:24.135350", "step": 1789, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:24.208793", "step": 1789, "epoch": 2 }, { "type": "loss", "content": 0.0038824868388473988, "timestamp": "2025-09-30 22:12:24.213275", "step": 1790, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:24.272697", "step": 1790, "epoch": 2 }, { "type": "loss", "content": 0.0029304868075996637, "timestamp": "2025-09-30 22:12:24.275392", "step": 1791, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:24.361640", "step": 1791, "epoch": 2 }, { "type": "loss", "content": 0.008040046319365501, "timestamp": "2025-09-30 22:12:24.373017", "step": 1792, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:24.447033", "step": 1792, "epoch": 2 }, { "type": "loss", "content": 0.005316526163369417, "timestamp": "2025-09-30 22:12:24.450440", "step": 1793, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:24.525425", "step": 1793, "epoch": 2 }, { "type": "loss", "content": 0.0005331950960680842, "timestamp": "2025-09-30 22:12:24.532931", "step": 1794, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:24.598646", "step": 1794, "epoch": 2 }, { "type": "loss", "content": 0.0008259565802291036, "timestamp": "2025-09-30 22:12:24.605028", "step": 1795, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:24.665867", "step": 1795, "epoch": 2 }, { "type": "loss", "content": 0.005228062160313129, "timestamp": "2025-09-30 22:12:24.674437", "step": 1796, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:24.742664", "step": 1796, "epoch": 2 }, { "type": "loss", "content": 0.025880755856633186, "timestamp": "2025-09-30 22:12:24.748436", "step": 1797, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:24.806826", "step": 1797, "epoch": 2 }, { "type": "loss", "content": 0.04264013096690178, "timestamp": "2025-09-30 22:12:24.813400", "step": 1798, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:24.875741", "step": 1798, "epoch": 2 }, { "type": "loss", "content": 0.03723777085542679, "timestamp": "2025-09-30 22:12:24.881371", "step": 1799, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:24.943188", "step": 1799, "epoch": 2 }, { "type": "loss", "content": 0.028541741892695427, "timestamp": "2025-09-30 22:12:24.950398", "step": 1800, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:25.009304", "step": 1800, "epoch": 2 }, { "type": "loss", "content": 0.000658934935927391, "timestamp": "2025-09-30 22:12:25.014125", "step": 1801, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:25.076617", "step": 1801, "epoch": 2 }, { "type": "loss", "content": 0.00514825526624918, "timestamp": "2025-09-30 22:12:25.081465", "step": 1802, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:25.140816", "step": 1802, "epoch": 2 }, { "type": "loss", "content": 0.01359549630433321, "timestamp": "2025-09-30 22:12:25.144175", "step": 1803, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:25.209586", "step": 1803, "epoch": 2 }, { "type": "loss", "content": 0.003553016809746623, "timestamp": "2025-09-30 22:12:25.216248", "step": 1804, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:25.276635", "step": 1804, "epoch": 2 }, { "type": "loss", "content": 0.01718035340309143, "timestamp": "2025-09-30 22:12:25.280427", "step": 1805, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:25.339761", "step": 1805, "epoch": 2 }, { "type": "loss", "content": 0.0026898744981735945, "timestamp": "2025-09-30 22:12:25.343047", "step": 1806, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:25.411331", "step": 1806, "epoch": 2 }, { "type": "loss", "content": 0.029336009174585342, "timestamp": "2025-09-30 22:12:25.414050", "step": 1807, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:25.476066", "step": 1807, "epoch": 2 }, { "type": "loss", "content": 0.0065011694096028805, "timestamp": "2025-09-30 22:12:25.485199", "step": 1808, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:25.550418", "step": 1808, "epoch": 2 }, { "type": "loss", "content": 0.004351469222456217, "timestamp": "2025-09-30 22:12:25.556302", "step": 1809, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:25.623912", "step": 1809, "epoch": 2 }, { "type": "loss", "content": 0.009557033888995647, "timestamp": "2025-09-30 22:12:25.626657", "step": 1810, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:25.686205", "step": 1810, "epoch": 2 }, { "type": "loss", "content": 0.021093687042593956, "timestamp": "2025-09-30 22:12:25.692721", "step": 1811, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:25.751652", "step": 1811, "epoch": 2 }, { "type": "loss", "content": 0.007726198993623257, "timestamp": "2025-09-30 22:12:25.761738", "step": 1812, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:25.826331", "step": 1812, "epoch": 2 }, { "type": "loss", "content": 0.006598359905183315, "timestamp": "2025-09-30 22:12:25.828662", "step": 1813, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:25.892888", "step": 1813, "epoch": 2 }, { "type": "loss", "content": 0.007870987989008427, "timestamp": "2025-09-30 22:12:25.896860", "step": 1814, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:25.952583", "step": 1814, "epoch": 2 }, { "type": "loss", "content": 0.011960902251303196, "timestamp": "2025-09-30 22:12:25.959745", "step": 1815, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:26.032617", "step": 1815, "epoch": 2 }, { "type": "loss", "content": 0.002732042223215103, "timestamp": "2025-09-30 22:12:26.040178", "step": 1816, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:26.098524", "step": 1816, "epoch": 2 }, { "type": "loss", "content": 0.0035709214862436056, "timestamp": "2025-09-30 22:12:26.102122", "step": 1817, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:26.162259", "step": 1817, "epoch": 2 }, { "type": "loss", "content": 0.03033597581088543, "timestamp": "2025-09-30 22:12:26.168318", "step": 1818, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:26.234789", "step": 1818, "epoch": 2 }, { "type": "loss", "content": 0.04264497384428978, "timestamp": "2025-09-30 22:12:26.242009", "step": 1819, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:26.300287", "step": 1819, "epoch": 2 }, { "type": "loss", "content": 0.008137590251863003, "timestamp": "2025-09-30 22:12:26.307528", "step": 1820, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:26.374224", "step": 1820, "epoch": 2 }, { "type": "loss", "content": 0.004997264593839645, "timestamp": "2025-09-30 22:12:26.384862", "step": 1821, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:26.448877", "step": 1821, "epoch": 2 }, { "type": "loss", "content": 0.0041570719331502914, "timestamp": "2025-09-30 22:12:26.453045", "step": 1822, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:26.526430", "step": 1822, "epoch": 2 }, { "type": "loss", "content": 0.004612022079527378, "timestamp": "2025-09-30 22:12:26.530133", "step": 1823, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:26.591520", "step": 1823, "epoch": 2 }, { "type": "loss", "content": 0.024884754791855812, "timestamp": "2025-09-30 22:12:26.598978", "step": 1824, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:12:28.095332", "step": 1824, "epoch": 2 }, { "type": "pplx", "content": 30672189.36827617, "timestamp": "2025-09-30 22:12:28.100771", "step": 1824, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:28.170601", "step": 1824, "epoch": 2 }, { "type": "loss", "content": 0.00739399716258049, "timestamp": "2025-09-30 22:12:28.173632", "step": 1825, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:28.241236", "step": 1825, "epoch": 2 }, { "type": "loss", "content": 0.01240342017263174, "timestamp": "2025-09-30 22:12:28.251710", "step": 1826, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:28.317530", "step": 1826, "epoch": 2 }, { "type": "loss", "content": 0.0024557800497859716, "timestamp": "2025-09-30 22:12:28.328228", "step": 1827, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:28.386913", "step": 1827, "epoch": 2 }, { "type": "loss", "content": 0.013910613022744656, "timestamp": "2025-09-30 22:12:28.394292", "step": 1828, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:28.450865", "step": 1828, "epoch": 2 }, { "type": "loss", "content": 0.001578363822773099, "timestamp": "2025-09-30 22:12:28.453690", "step": 1829, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:28.511246", "step": 1829, "epoch": 2 }, { "type": "loss", "content": 0.04201820492744446, "timestamp": "2025-09-30 22:12:28.517272", "step": 1830, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:28.576318", "step": 1830, "epoch": 2 }, { "type": "loss", "content": 0.0008280682959593832, "timestamp": "2025-09-30 22:12:28.578521", "step": 1831, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:28.649280", "step": 1831, "epoch": 2 }, { "type": "loss", "content": 0.006336410064250231, "timestamp": "2025-09-30 22:12:28.661730", "step": 1832, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:28.742433", "step": 1832, "epoch": 2 }, { "type": "loss", "content": 0.005034334491938353, "timestamp": "2025-09-30 22:12:28.746912", "step": 1833, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:28.836112", "step": 1833, "epoch": 2 }, { "type": "loss", "content": 0.013774161227047443, "timestamp": "2025-09-30 22:12:28.839488", "step": 1834, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:28.905166", "step": 1834, "epoch": 3 }, { "type": "loss", "content": 0.06970737874507904, "timestamp": "2025-09-30 22:12:28.913773", "step": 1835, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:28.974737", "step": 1835, "epoch": 3 }, { "type": "loss", "content": 0.053864989429712296, "timestamp": "2025-09-30 22:12:28.983214", "step": 1836, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:29.050549", "step": 1836, "epoch": 3 }, { "type": "loss", "content": 0.0217081718146801, "timestamp": "2025-09-30 22:12:29.052793", "step": 1837, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:29.121315", "step": 1837, "epoch": 3 }, { "type": "loss", "content": 0.013128918595612049, "timestamp": "2025-09-30 22:12:29.123710", "step": 1838, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:29.188599", "step": 1838, "epoch": 3 }, { "type": "loss", "content": 0.057358644902706146, "timestamp": "2025-09-30 22:12:29.191407", "step": 1839, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:29.249536", "step": 1839, "epoch": 3 }, { "type": "loss", "content": 0.03622815012931824, "timestamp": "2025-09-30 22:12:29.255911", "step": 1840, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:29.323306", "step": 1840, "epoch": 3 }, { "type": "loss", "content": 0.027150630950927734, "timestamp": "2025-09-30 22:12:29.325939", "step": 1841, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:29.391564", "step": 1841, "epoch": 3 }, { "type": "loss", "content": 0.002653220435604453, "timestamp": "2025-09-30 22:12:29.398259", "step": 1842, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:29.482420", "step": 1842, "epoch": 3 }, { "type": "loss", "content": 0.0013348475331440568, "timestamp": "2025-09-30 22:12:29.485991", "step": 1843, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:29.544185", "step": 1843, "epoch": 3 }, { "type": "loss", "content": 0.029313331469893456, "timestamp": "2025-09-30 22:12:29.549874", "step": 1844, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:29.621496", "step": 1844, "epoch": 3 }, { "type": "loss", "content": 0.007896478287875652, "timestamp": "2025-09-30 22:12:29.625555", "step": 1845, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:29.689604", "step": 1845, "epoch": 3 }, { "type": "loss", "content": 0.0026143400464206934, "timestamp": "2025-09-30 22:12:29.696347", "step": 1846, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:29.758025", "step": 1846, "epoch": 3 }, { "type": "loss", "content": 0.01978653483092785, "timestamp": "2025-09-30 22:12:29.761214", "step": 1847, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:29.827132", "step": 1847, "epoch": 3 }, { "type": "loss", "content": 0.028982680290937424, "timestamp": "2025-09-30 22:12:29.833733", "step": 1848, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:29.893636", "step": 1848, "epoch": 3 }, { "type": "loss", "content": 0.03256614878773689, "timestamp": "2025-09-30 22:12:29.898393", "step": 1849, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:29.963923", "step": 1849, "epoch": 3 }, { "type": "loss", "content": 0.010735915042459965, "timestamp": "2025-09-30 22:12:29.967708", "step": 1850, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:30.025959", "step": 1850, "epoch": 3 }, { "type": "loss", "content": 0.011695819906890392, "timestamp": "2025-09-30 22:12:30.029295", "step": 1851, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:30.084003", "step": 1851, "epoch": 3 }, { "type": "loss", "content": 0.006875795312225819, "timestamp": "2025-09-30 22:12:30.093480", "step": 1852, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:30.151820", "step": 1852, "epoch": 3 }, { "type": "loss", "content": 0.014828616753220558, "timestamp": "2025-09-30 22:12:30.157422", "step": 1853, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:30.218984", "step": 1853, "epoch": 3 }, { "type": "loss", "content": 0.010935215279459953, "timestamp": "2025-09-30 22:12:30.222139", "step": 1854, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:30.280694", "step": 1854, "epoch": 3 }, { "type": "loss", "content": 0.011751067824661732, "timestamp": "2025-09-30 22:12:30.283823", "step": 1855, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:30.342542", "step": 1855, "epoch": 3 }, { "type": "loss", "content": 0.03328597545623779, "timestamp": "2025-09-30 22:12:30.349244", "step": 1856, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:30.427820", "step": 1856, "epoch": 3 }, { "type": "loss", "content": 0.015032247640192509, "timestamp": "2025-09-30 22:12:30.431664", "step": 1857, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:30.487533", "step": 1857, "epoch": 3 }, { "type": "loss", "content": 0.014371916651725769, "timestamp": "2025-09-30 22:12:30.495788", "step": 1858, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:30.578734", "step": 1858, "epoch": 3 }, { "type": "loss", "content": 0.010754152201116085, "timestamp": "2025-09-30 22:12:30.584875", "step": 1859, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:30.647369", "step": 1859, "epoch": 3 }, { "type": "loss", "content": 0.01473289541900158, "timestamp": "2025-09-30 22:12:30.654052", "step": 1860, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:30.712046", "step": 1860, "epoch": 3 }, { "type": "loss", "content": 0.008067458868026733, "timestamp": "2025-09-30 22:12:30.714880", "step": 1861, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:30.774679", "step": 1861, "epoch": 3 }, { "type": "loss", "content": 0.02813909575343132, "timestamp": "2025-09-30 22:12:30.779834", "step": 1862, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:30.841083", "step": 1862, "epoch": 3 }, { "type": "loss", "content": 0.008334296755492687, "timestamp": "2025-09-30 22:12:30.852524", "step": 1863, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:30.913530", "step": 1863, "epoch": 3 }, { "type": "loss", "content": 0.015859607607126236, "timestamp": "2025-09-30 22:12:30.920050", "step": 1864, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:30.976590", "step": 1864, "epoch": 3 }, { "type": "loss", "content": 0.017760148271918297, "timestamp": "2025-09-30 22:12:30.982243", "step": 1865, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:31.053340", "step": 1865, "epoch": 3 }, { "type": "loss", "content": 0.024972213432192802, "timestamp": "2025-09-30 22:12:31.056911", "step": 1866, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:31.125418", "step": 1866, "epoch": 3 }, { "type": "loss", "content": 0.007451011333614588, "timestamp": "2025-09-30 22:12:31.128354", "step": 1867, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:31.185780", "step": 1867, "epoch": 3 }, { "type": "loss", "content": 0.023097878322005272, "timestamp": "2025-09-30 22:12:31.191973", "step": 1868, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:31.247075", "step": 1868, "epoch": 3 }, { "type": "loss", "content": 0.005390969570726156, "timestamp": "2025-09-30 22:12:31.250345", "step": 1869, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:31.307936", "step": 1869, "epoch": 3 }, { "type": "loss", "content": 0.014087321236729622, "timestamp": "2025-09-30 22:12:31.322415", "step": 1870, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:31.385612", "step": 1870, "epoch": 3 }, { "type": "loss", "content": 0.018397578969597816, "timestamp": "2025-09-30 22:12:31.390764", "step": 1871, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:31.462449", "step": 1871, "epoch": 3 }, { "type": "loss", "content": 0.007025882601737976, "timestamp": "2025-09-30 22:12:31.480345", "step": 1872, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:31.536704", "step": 1872, "epoch": 3 }, { "type": "loss", "content": 0.011556877754628658, "timestamp": "2025-09-30 22:12:31.541459", "step": 1873, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:31.605457", "step": 1873, "epoch": 3 }, { "type": "loss", "content": 0.01839577779173851, "timestamp": "2025-09-30 22:12:31.608900", "step": 1874, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:31.680668", "step": 1874, "epoch": 3 }, { "type": "loss", "content": 0.010417903773486614, "timestamp": "2025-09-30 22:12:31.690146", "step": 1875, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:31.744977", "step": 1875, "epoch": 3 }, { "type": "loss", "content": 0.009947366081178188, "timestamp": "2025-09-30 22:12:31.751963", "step": 1876, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:31.820158", "step": 1876, "epoch": 3 }, { "type": "loss", "content": 0.012040347792208195, "timestamp": "2025-09-30 22:12:31.822535", "step": 1877, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:31.890594", "step": 1877, "epoch": 3 }, { "type": "loss", "content": 0.0012225349200889468, "timestamp": "2025-09-30 22:12:31.894112", "step": 1878, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:31.949728", "step": 1878, "epoch": 3 }, { "type": "loss", "content": 0.006086386274546385, "timestamp": "2025-09-30 22:12:31.953732", "step": 1879, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:32.009116", "step": 1879, "epoch": 3 }, { "type": "loss", "content": 0.00934376660734415, "timestamp": "2025-09-30 22:12:32.016702", "step": 1880, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:32.071527", "step": 1880, "epoch": 3 }, { "type": "loss", "content": 0.009230917319655418, "timestamp": "2025-09-30 22:12:32.073939", "step": 1881, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:12:33.529057", "step": 1881, "epoch": 3 }, { "type": "pplx", "content": 29811204.71677049, "timestamp": "2025-09-30 22:12:33.532880", "step": 1881, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:33.589718", "step": 1881, "epoch": 3 }, { "type": "loss", "content": 0.002237612148746848, "timestamp": "2025-09-30 22:12:33.597449", "step": 1882, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:33.654864", "step": 1882, "epoch": 3 }, { "type": "loss", "content": 0.0021926003973931074, "timestamp": "2025-09-30 22:12:33.657646", "step": 1883, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:33.719097", "step": 1883, "epoch": 3 }, { "type": "loss", "content": 0.007021576166152954, "timestamp": "2025-09-30 22:12:33.725938", "step": 1884, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:33.787887", "step": 1884, "epoch": 3 }, { "type": "loss", "content": 0.002189788268879056, "timestamp": "2025-09-30 22:12:33.791754", "step": 1885, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:33.849419", "step": 1885, "epoch": 3 }, { "type": "loss", "content": 0.0325036458671093, "timestamp": "2025-09-30 22:12:33.852113", "step": 1886, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:33.912628", "step": 1886, "epoch": 3 }, { "type": "loss", "content": 0.02236909046769142, "timestamp": "2025-09-30 22:12:33.915640", "step": 1887, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:33.973272", "step": 1887, "epoch": 3 }, { "type": "loss", "content": 0.018104365095496178, "timestamp": "2025-09-30 22:12:33.979563", "step": 1888, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:34.034551", "step": 1888, "epoch": 3 }, { "type": "loss", "content": 0.01875036023557186, "timestamp": "2025-09-30 22:12:34.037045", "step": 1889, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:34.095906", "step": 1889, "epoch": 3 }, { "type": "loss", "content": 0.010548067279160023, "timestamp": "2025-09-30 22:12:34.098658", "step": 1890, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:34.155832", "step": 1890, "epoch": 3 }, { "type": "loss", "content": 0.007385232951492071, "timestamp": "2025-09-30 22:12:34.159492", "step": 1891, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:34.214578", "step": 1891, "epoch": 3 }, { "type": "loss", "content": 0.00859599094837904, "timestamp": "2025-09-30 22:12:34.226724", "step": 1892, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:34.287077", "step": 1892, "epoch": 3 }, { "type": "loss", "content": 0.011528218165040016, "timestamp": "2025-09-30 22:12:34.289393", "step": 1893, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:34.344245", "step": 1893, "epoch": 3 }, { "type": "loss", "content": 0.03839213401079178, "timestamp": "2025-09-30 22:12:34.347664", "step": 1894, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:34.413440", "step": 1894, "epoch": 3 }, { "type": "loss", "content": 0.004163206089287996, "timestamp": "2025-09-30 22:12:34.420353", "step": 1895, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:34.476133", "step": 1895, "epoch": 3 }, { "type": "loss", "content": 0.006831273436546326, "timestamp": "2025-09-30 22:12:34.482107", "step": 1896, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:34.550831", "step": 1896, "epoch": 3 }, { "type": "loss", "content": 0.0011001095408573747, "timestamp": "2025-09-30 22:12:34.553869", "step": 1897, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:34.613063", "step": 1897, "epoch": 3 }, { "type": "loss", "content": 0.002597188577055931, "timestamp": "2025-09-30 22:12:34.619835", "step": 1898, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:34.678929", "step": 1898, "epoch": 3 }, { "type": "loss", "content": 0.003021536162123084, "timestamp": "2025-09-30 22:12:34.681965", "step": 1899, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:34.736489", "step": 1899, "epoch": 3 }, { "type": "loss", "content": 0.035559866577386856, "timestamp": "2025-09-30 22:12:34.747555", "step": 1900, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:34.808191", "step": 1900, "epoch": 3 }, { "type": "loss", "content": 0.00288590369746089, "timestamp": "2025-09-30 22:12:34.812074", "step": 1901, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:34.875165", "step": 1901, "epoch": 3 }, { "type": "loss", "content": 0.010649462230503559, "timestamp": "2025-09-30 22:12:34.877318", "step": 1902, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:34.938011", "step": 1902, "epoch": 3 }, { "type": "loss", "content": 0.020891645923256874, "timestamp": "2025-09-30 22:12:34.946875", "step": 1903, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:35.007218", "step": 1903, "epoch": 3 }, { "type": "loss", "content": 0.034685712307691574, "timestamp": "2025-09-30 22:12:35.019492", "step": 1904, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:35.075150", "step": 1904, "epoch": 3 }, { "type": "loss", "content": 0.0015056979609653354, "timestamp": "2025-09-30 22:12:35.078579", "step": 1905, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:35.133854", "step": 1905, "epoch": 3 }, { "type": "loss", "content": 0.036593399941921234, "timestamp": "2025-09-30 22:12:35.137590", "step": 1906, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:35.193504", "step": 1906, "epoch": 3 }, { "type": "loss", "content": 0.0018256593029946089, "timestamp": "2025-09-30 22:12:35.196477", "step": 1907, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:35.255154", "step": 1907, "epoch": 3 }, { "type": "loss", "content": 0.04090496152639389, "timestamp": "2025-09-30 22:12:35.261202", "step": 1908, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:35.317711", "step": 1908, "epoch": 3 }, { "type": "loss", "content": 0.014386068098247051, "timestamp": "2025-09-30 22:12:35.321688", "step": 1909, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:35.386201", "step": 1909, "epoch": 3 }, { "type": "loss", "content": 0.018922999501228333, "timestamp": "2025-09-30 22:12:35.390261", "step": 1910, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:35.449885", "step": 1910, "epoch": 3 }, { "type": "loss", "content": 0.015430129133164883, "timestamp": "2025-09-30 22:12:35.456042", "step": 1911, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:35.516289", "step": 1911, "epoch": 3 }, { "type": "loss", "content": 0.014453909359872341, "timestamp": "2025-09-30 22:12:35.522502", "step": 1912, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:35.583529", "step": 1912, "epoch": 3 }, { "type": "loss", "content": 0.019513025879859924, "timestamp": "2025-09-30 22:12:35.586244", "step": 1913, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:35.644204", "step": 1913, "epoch": 3 }, { "type": "loss", "content": 0.011197819374501705, "timestamp": "2025-09-30 22:12:35.659235", "step": 1914, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:35.718192", "step": 1914, "epoch": 3 }, { "type": "loss", "content": 0.016041845083236694, "timestamp": "2025-09-30 22:12:35.720928", "step": 1915, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:35.776597", "step": 1915, "epoch": 3 }, { "type": "loss", "content": 0.008869746699929237, "timestamp": "2025-09-30 22:12:35.786518", "step": 1916, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:35.843545", "step": 1916, "epoch": 3 }, { "type": "loss", "content": 0.009099365212023258, "timestamp": "2025-09-30 22:12:35.847455", "step": 1917, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:35.904360", "step": 1917, "epoch": 3 }, { "type": "loss", "content": 0.014621244743466377, "timestamp": "2025-09-30 22:12:35.909624", "step": 1918, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:35.965569", "step": 1918, "epoch": 3 }, { "type": "loss", "content": 0.009779931046068668, "timestamp": "2025-09-30 22:12:35.981765", "step": 1919, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:36.046589", "step": 1919, "epoch": 3 }, { "type": "loss", "content": 0.016505222767591476, "timestamp": "2025-09-30 22:12:36.060991", "step": 1920, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:36.125986", "step": 1920, "epoch": 3 }, { "type": "loss", "content": 0.006841294001787901, "timestamp": "2025-09-30 22:12:36.129710", "step": 1921, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:36.189066", "step": 1921, "epoch": 3 }, { "type": "loss", "content": 0.009847632609307766, "timestamp": "2025-09-30 22:12:36.195855", "step": 1922, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:36.252944", "step": 1922, "epoch": 3 }, { "type": "loss", "content": 0.02639087848365307, "timestamp": "2025-09-30 22:12:36.258090", "step": 1923, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:36.319740", "step": 1923, "epoch": 3 }, { "type": "loss", "content": 0.01638978160917759, "timestamp": "2025-09-30 22:12:36.326755", "step": 1924, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:36.386575", "step": 1924, "epoch": 3 }, { "type": "loss", "content": 0.028748007491230965, "timestamp": "2025-09-30 22:12:36.390353", "step": 1925, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:36.444542", "step": 1925, "epoch": 3 }, { "type": "loss", "content": 0.018612733110785484, "timestamp": "2025-09-30 22:12:36.447012", "step": 1926, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:36.501961", "step": 1926, "epoch": 3 }, { "type": "loss", "content": 0.004065474960952997, "timestamp": "2025-09-30 22:12:36.508877", "step": 1927, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:36.565538", "step": 1927, "epoch": 3 }, { "type": "loss", "content": 0.006613335572183132, "timestamp": "2025-09-30 22:12:36.571650", "step": 1928, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:36.625346", "step": 1928, "epoch": 3 }, { "type": "loss", "content": 0.024656997993588448, "timestamp": "2025-09-30 22:12:36.628060", "step": 1929, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:36.686966", "step": 1929, "epoch": 3 }, { "type": "loss", "content": 0.04802418872714043, "timestamp": "2025-09-30 22:12:36.689252", "step": 1930, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:36.742692", "step": 1930, "epoch": 3 }, { "type": "loss", "content": 0.021498506888747215, "timestamp": "2025-09-30 22:12:36.745190", "step": 1931, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:36.799740", "step": 1931, "epoch": 3 }, { "type": "loss", "content": 0.008578136563301086, "timestamp": "2025-09-30 22:12:36.805549", "step": 1932, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:36.861069", "step": 1932, "epoch": 3 }, { "type": "loss", "content": 0.0029331946279853582, "timestamp": "2025-09-30 22:12:36.863376", "step": 1933, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:36.925739", "step": 1933, "epoch": 3 }, { "type": "loss", "content": 0.01277841441333294, "timestamp": "2025-09-30 22:12:36.928011", "step": 1934, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:36.989085", "step": 1934, "epoch": 3 }, { "type": "loss", "content": 0.008462684229016304, "timestamp": "2025-09-30 22:12:36.992617", "step": 1935, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:37.058882", "step": 1935, "epoch": 3 }, { "type": "loss", "content": 0.00633528595790267, "timestamp": "2025-09-30 22:12:37.064520", "step": 1936, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:37.126025", "step": 1936, "epoch": 3 }, { "type": "loss", "content": 0.009705213829874992, "timestamp": "2025-09-30 22:12:37.128017", "step": 1937, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:37.183503", "step": 1937, "epoch": 3 }, { "type": "loss", "content": 0.009753524325788021, "timestamp": "2025-09-30 22:12:37.187696", "step": 1938, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:12:38.512858", "step": 1938, "epoch": 3 }, { "type": "pplx", "content": 29558369.15336448, "timestamp": "2025-09-30 22:12:38.514909", "step": 1938, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:38.567034", "step": 1938, "epoch": 3 }, { "type": "loss", "content": 0.025362467393279076, "timestamp": "2025-09-30 22:12:38.571280", "step": 1939, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:38.646596", "step": 1939, "epoch": 3 }, { "type": "loss", "content": 0.004295314662158489, "timestamp": "2025-09-30 22:12:38.654788", "step": 1940, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:38.717141", "step": 1940, "epoch": 3 }, { "type": "loss", "content": 0.0038865043316036463, "timestamp": "2025-09-30 22:12:38.723450", "step": 1941, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:38.785516", "step": 1941, "epoch": 3 }, { "type": "loss", "content": 0.016797490417957306, "timestamp": "2025-09-30 22:12:38.790002", "step": 1942, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:38.845954", "step": 1942, "epoch": 3 }, { "type": "loss", "content": 0.031100988388061523, "timestamp": "2025-09-30 22:12:38.850849", "step": 1943, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:12:38.924658", "step": 1943, "epoch": 3 }, { "type": "loss", "content": 0.0393851213157177, "timestamp": "2025-09-30 22:12:38.935303", "step": 1944, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:38.996063", "step": 1944, "epoch": 3 }, { "type": "loss", "content": 0.005985604133456945, "timestamp": "2025-09-30 22:12:39.001238", "step": 1945, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:39.058690", "step": 1945, "epoch": 3 }, { "type": "loss", "content": 0.008343399502336979, "timestamp": "2025-09-30 22:12:39.064100", "step": 1946, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:39.120940", "step": 1946, "epoch": 3 }, { "type": "loss", "content": 0.003788830479606986, "timestamp": "2025-09-30 22:12:39.126550", "step": 1947, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:39.185776", "step": 1947, "epoch": 3 }, { "type": "loss", "content": 0.005876143928617239, "timestamp": "2025-09-30 22:12:39.194044", "step": 1948, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:39.247405", "step": 1948, "epoch": 3 }, { "type": "loss", "content": 0.009399537928402424, "timestamp": "2025-09-30 22:12:39.252551", "step": 1949, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:39.317976", "step": 1949, "epoch": 3 }, { "type": "loss", "content": 0.0053526838310062885, "timestamp": "2025-09-30 22:12:39.321738", "step": 1950, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:39.384587", "step": 1950, "epoch": 3 }, { "type": "loss", "content": 0.021824544295668602, "timestamp": "2025-09-30 22:12:39.389592", "step": 1951, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:39.448210", "step": 1951, "epoch": 3 }, { "type": "loss", "content": 0.016011234372854233, "timestamp": "2025-09-30 22:12:39.457427", "step": 1952, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:39.515827", "step": 1952, "epoch": 3 }, { "type": "loss", "content": 0.0025454445276409388, "timestamp": "2025-09-30 22:12:39.522667", "step": 1953, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:39.580991", "step": 1953, "epoch": 3 }, { "type": "loss", "content": 0.003802164224907756, "timestamp": "2025-09-30 22:12:39.584553", "step": 1954, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:39.644276", "step": 1954, "epoch": 3 }, { "type": "loss", "content": 0.03186912089586258, "timestamp": "2025-09-30 22:12:39.646374", "step": 1955, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:39.704066", "step": 1955, "epoch": 3 }, { "type": "loss", "content": 0.023978808894753456, "timestamp": "2025-09-30 22:12:39.710510", "step": 1956, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:39.767903", "step": 1956, "epoch": 3 }, { "type": "loss", "content": 0.01542146410793066, "timestamp": "2025-09-30 22:12:39.770627", "step": 1957, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:39.824884", "step": 1957, "epoch": 3 }, { "type": "loss", "content": 0.009438499808311462, "timestamp": "2025-09-30 22:12:39.830557", "step": 1958, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:39.900535", "step": 1958, "epoch": 3 }, { "type": "loss", "content": 0.00540934456512332, "timestamp": "2025-09-30 22:12:39.903028", "step": 1959, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:39.958455", "step": 1959, "epoch": 3 }, { "type": "loss", "content": 0.034282151609659195, "timestamp": "2025-09-30 22:12:39.964357", "step": 1960, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:40.020785", "step": 1960, "epoch": 3 }, { "type": "loss", "content": 0.006182772573083639, "timestamp": "2025-09-30 22:12:40.022932", "step": 1961, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:40.077532", "step": 1961, "epoch": 3 }, { "type": "loss", "content": 0.004338286817073822, "timestamp": "2025-09-30 22:12:40.079805", "step": 1962, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:40.134069", "step": 1962, "epoch": 3 }, { "type": "loss", "content": 0.012008560821413994, "timestamp": "2025-09-30 22:12:40.136462", "step": 1963, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:40.217923", "step": 1963, "epoch": 3 }, { "type": "loss", "content": 0.03428441658616066, "timestamp": "2025-09-30 22:12:40.223659", "step": 1964, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:40.278527", "step": 1964, "epoch": 3 }, { "type": "loss", "content": 0.008386987261474133, "timestamp": "2025-09-30 22:12:40.280482", "step": 1965, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:40.340067", "step": 1965, "epoch": 3 }, { "type": "loss", "content": 0.0035431894939392805, "timestamp": "2025-09-30 22:12:40.352999", "step": 1966, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:40.426650", "step": 1966, "epoch": 3 }, { "type": "loss", "content": 0.005195711273699999, "timestamp": "2025-09-30 22:12:40.428878", "step": 1967, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:40.483244", "step": 1967, "epoch": 3 }, { "type": "loss", "content": 0.014768867753446102, "timestamp": "2025-09-30 22:12:40.489131", "step": 1968, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:40.541727", "step": 1968, "epoch": 3 }, { "type": "loss", "content": 0.0026658920105546713, "timestamp": "2025-09-30 22:12:40.544813", "step": 1969, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:40.605167", "step": 1969, "epoch": 3 }, { "type": "loss", "content": 0.00781507883220911, "timestamp": "2025-09-30 22:12:40.607386", "step": 1970, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:40.671135", "step": 1970, "epoch": 3 }, { "type": "loss", "content": 0.012213180772960186, "timestamp": "2025-09-30 22:12:40.672951", "step": 1971, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:40.731179", "step": 1971, "epoch": 3 }, { "type": "loss", "content": 0.030378760769963264, "timestamp": "2025-09-30 22:12:40.739563", "step": 1972, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:40.796297", "step": 1972, "epoch": 3 }, { "type": "loss", "content": 0.014325362630188465, "timestamp": "2025-09-30 22:12:40.800560", "step": 1973, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:40.865717", "step": 1973, "epoch": 3 }, { "type": "loss", "content": 0.006878285203129053, "timestamp": "2025-09-30 22:12:40.871537", "step": 1974, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:40.943601", "step": 1974, "epoch": 3 }, { "type": "loss", "content": 0.019338076934218407, "timestamp": "2025-09-30 22:12:40.960489", "step": 1975, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:41.020679", "step": 1975, "epoch": 3 }, { "type": "loss", "content": 0.013357887975871563, "timestamp": "2025-09-30 22:12:41.029453", "step": 1976, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:41.107755", "step": 1976, "epoch": 3 }, { "type": "loss", "content": 0.00660253269597888, "timestamp": "2025-09-30 22:12:41.126488", "step": 1977, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:41.203228", "step": 1977, "epoch": 3 }, { "type": "loss", "content": 0.006124368868768215, "timestamp": "2025-09-30 22:12:41.221345", "step": 1978, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:41.292381", "step": 1978, "epoch": 3 }, { "type": "loss", "content": 0.00962238758802414, "timestamp": "2025-09-30 22:12:41.299224", "step": 1979, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:41.357369", "step": 1979, "epoch": 3 }, { "type": "loss", "content": 0.002619996899738908, "timestamp": "2025-09-30 22:12:41.379860", "step": 1980, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:41.448904", "step": 1980, "epoch": 3 }, { "type": "loss", "content": 0.008656610734760761, "timestamp": "2025-09-30 22:12:41.452895", "step": 1981, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:41.515103", "step": 1981, "epoch": 3 }, { "type": "loss", "content": 0.028397956863045692, "timestamp": "2025-09-30 22:12:41.519895", "step": 1982, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:41.586905", "step": 1982, "epoch": 3 }, { "type": "loss", "content": 0.02610507234930992, "timestamp": "2025-09-30 22:12:41.590858", "step": 1983, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:41.654054", "step": 1983, "epoch": 3 }, { "type": "loss", "content": 0.018599865958094597, "timestamp": "2025-09-30 22:12:41.671758", "step": 1984, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:41.737134", "step": 1984, "epoch": 3 }, { "type": "loss", "content": 0.005543197970837355, "timestamp": "2025-09-30 22:12:41.742183", "step": 1985, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:41.813581", "step": 1985, "epoch": 3 }, { "type": "loss", "content": 0.002317222999408841, "timestamp": "2025-09-30 22:12:41.815633", "step": 1986, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:41.881426", "step": 1986, "epoch": 3 }, { "type": "loss", "content": 0.014996451325714588, "timestamp": "2025-09-30 22:12:41.883416", "step": 1987, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:41.937781", "step": 1987, "epoch": 3 }, { "type": "loss", "content": 0.017824586480855942, "timestamp": "2025-09-30 22:12:41.943617", "step": 1988, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:42.001541", "step": 1988, "epoch": 3 }, { "type": "loss", "content": 0.026619719341397285, "timestamp": "2025-09-30 22:12:42.004640", "step": 1989, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:42.068935", "step": 1989, "epoch": 3 }, { "type": "loss", "content": 0.0035347731318324804, "timestamp": "2025-09-30 22:12:42.071286", "step": 1990, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:42.126117", "step": 1990, "epoch": 3 }, { "type": "loss", "content": 0.03799136355519295, "timestamp": "2025-09-30 22:12:42.128517", "step": 1991, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:42.184013", "step": 1991, "epoch": 3 }, { "type": "loss", "content": 0.04470697417855263, "timestamp": "2025-09-30 22:12:42.189722", "step": 1992, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:42.244769", "step": 1992, "epoch": 3 }, { "type": "loss", "content": 0.05278744548559189, "timestamp": "2025-09-30 22:12:42.249219", "step": 1993, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:42.315365", "step": 1993, "epoch": 3 }, { "type": "loss", "content": 0.01737930439412594, "timestamp": "2025-09-30 22:12:42.318475", "step": 1994, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:42.383369", "step": 1994, "epoch": 3 }, { "type": "loss", "content": 0.01412777416408062, "timestamp": "2025-09-30 22:12:42.385937", "step": 1995, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:12:43.827522", "step": 1995, "epoch": 3 }, { "type": "pplx", "content": 30986920.64344966, "timestamp": "2025-09-30 22:12:43.836901", "step": 1995, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:43.896684", "step": 1995, "epoch": 3 }, { "type": "loss", "content": 0.007152962498366833, "timestamp": "2025-09-30 22:12:43.904023", "step": 1996, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:43.962205", "step": 1996, "epoch": 3 }, { "type": "loss", "content": 0.008768231607973576, "timestamp": "2025-09-30 22:12:43.965930", "step": 1997, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:44.024945", "step": 1997, "epoch": 3 }, { "type": "loss", "content": 0.014288007281720638, "timestamp": "2025-09-30 22:12:44.028859", "step": 1998, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:44.086462", "step": 1998, "epoch": 3 }, { "type": "loss", "content": 0.025513725355267525, "timestamp": "2025-09-30 22:12:44.090556", "step": 1999, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:44.145220", "step": 1999, "epoch": 3 }, { "type": "loss", "content": 0.022684959694743156, "timestamp": "2025-09-30 22:12:44.152892", "step": 2000, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2000", "timestamp": "2025-09-30 22:12:44.704758", "step": 2000, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:44.762348", "step": 2000, "epoch": 3 }, { "type": "loss", "content": 0.01981218159198761, "timestamp": "2025-09-30 22:12:44.765533", "step": 2001, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:44.837870", "step": 2001, "epoch": 3 }, { "type": "loss", "content": 0.006580715533345938, "timestamp": "2025-09-30 22:12:44.840899", "step": 2002, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:44.910869", "step": 2002, "epoch": 3 }, { "type": "loss", "content": 0.007349673192948103, "timestamp": "2025-09-30 22:12:44.924598", "step": 2003, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:44.990101", "step": 2003, "epoch": 3 }, { "type": "loss", "content": 0.03402791544795036, "timestamp": "2025-09-30 22:12:44.996862", "step": 2004, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:45.064039", "step": 2004, "epoch": 3 }, { "type": "loss", "content": 0.0037932060658931732, "timestamp": "2025-09-30 22:12:45.073329", "step": 2005, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:45.132903", "step": 2005, "epoch": 3 }, { "type": "loss", "content": 0.03237783536314964, "timestamp": "2025-09-30 22:12:45.135893", "step": 2006, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:45.204595", "step": 2006, "epoch": 3 }, { "type": "loss", "content": 0.012929347343742847, "timestamp": "2025-09-30 22:12:45.207303", "step": 2007, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:45.265577", "step": 2007, "epoch": 3 }, { "type": "loss", "content": 0.01525042299181223, "timestamp": "2025-09-30 22:12:45.277214", "step": 2008, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:45.338311", "step": 2008, "epoch": 3 }, { "type": "loss", "content": 0.018445158377289772, "timestamp": "2025-09-30 22:12:45.342291", "step": 2009, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:45.406350", "step": 2009, "epoch": 3 }, { "type": "loss", "content": 0.023437367752194405, "timestamp": "2025-09-30 22:12:45.416485", "step": 2010, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:45.478091", "step": 2010, "epoch": 3 }, { "type": "loss", "content": 0.022062715142965317, "timestamp": "2025-09-30 22:12:45.487759", "step": 2011, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:45.555782", "step": 2011, "epoch": 3 }, { "type": "loss", "content": 0.022687969729304314, "timestamp": "2025-09-30 22:12:45.562458", "step": 2012, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:45.629028", "step": 2012, "epoch": 3 }, { "type": "loss", "content": 0.007122944109141827, "timestamp": "2025-09-30 22:12:45.631721", "step": 2013, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:45.688904", "step": 2013, "epoch": 3 }, { "type": "loss", "content": 0.007444100920110941, "timestamp": "2025-09-30 22:12:45.692505", "step": 2014, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:45.747857", "step": 2014, "epoch": 3 }, { "type": "loss", "content": 0.02668391726911068, "timestamp": "2025-09-30 22:12:45.757716", "step": 2015, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:45.824072", "step": 2015, "epoch": 3 }, { "type": "loss", "content": 0.01846177875995636, "timestamp": "2025-09-30 22:12:45.830050", "step": 2016, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:45.888760", "step": 2016, "epoch": 3 }, { "type": "loss", "content": 0.013639253564178944, "timestamp": "2025-09-30 22:12:45.898113", "step": 2017, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:45.960139", "step": 2017, "epoch": 3 }, { "type": "loss", "content": 0.01387722697108984, "timestamp": "2025-09-30 22:12:45.968363", "step": 2018, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:46.026652", "step": 2018, "epoch": 3 }, { "type": "loss", "content": 0.00344693916849792, "timestamp": "2025-09-30 22:12:46.036303", "step": 2019, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:46.092962", "step": 2019, "epoch": 3 }, { "type": "loss", "content": 0.010282598435878754, "timestamp": "2025-09-30 22:12:46.101409", "step": 2020, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:46.156391", "step": 2020, "epoch": 3 }, { "type": "loss", "content": 0.006298901047557592, "timestamp": "2025-09-30 22:12:46.159534", "step": 2021, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:46.213661", "step": 2021, "epoch": 3 }, { "type": "loss", "content": 0.008653373457491398, "timestamp": "2025-09-30 22:12:46.217113", "step": 2022, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:46.271623", "step": 2022, "epoch": 3 }, { "type": "loss", "content": 0.0037791277281939983, "timestamp": "2025-09-30 22:12:46.276546", "step": 2023, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:46.338579", "step": 2023, "epoch": 3 }, { "type": "loss", "content": 0.014515328221023083, "timestamp": "2025-09-30 22:12:46.351822", "step": 2024, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:46.405559", "step": 2024, "epoch": 3 }, { "type": "loss", "content": 0.013266796246170998, "timestamp": "2025-09-30 22:12:46.416235", "step": 2025, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:46.474075", "step": 2025, "epoch": 3 }, { "type": "loss", "content": 0.002332469215616584, "timestamp": "2025-09-30 22:12:46.483229", "step": 2026, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:46.547920", "step": 2026, "epoch": 3 }, { "type": "loss", "content": 0.00418002950027585, "timestamp": "2025-09-30 22:12:46.559265", "step": 2027, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:46.614126", "step": 2027, "epoch": 3 }, { "type": "loss", "content": 0.004884445574134588, "timestamp": "2025-09-30 22:12:46.621285", "step": 2028, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:46.683577", "step": 2028, "epoch": 3 }, { "type": "loss", "content": 0.01921374723315239, "timestamp": "2025-09-30 22:12:46.687247", "step": 2029, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:46.754441", "step": 2029, "epoch": 3 }, { "type": "loss", "content": 0.01416950486600399, "timestamp": "2025-09-30 22:12:46.764603", "step": 2030, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:46.826453", "step": 2030, "epoch": 3 }, { "type": "loss", "content": 0.004991916473954916, "timestamp": "2025-09-30 22:12:46.829235", "step": 2031, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:46.891979", "step": 2031, "epoch": 3 }, { "type": "loss", "content": 0.00923225563019514, "timestamp": "2025-09-30 22:12:46.899939", "step": 2032, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:46.960726", "step": 2032, "epoch": 3 }, { "type": "loss", "content": 0.029489969834685326, "timestamp": "2025-09-30 22:12:46.963458", "step": 2033, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:47.025987", "step": 2033, "epoch": 3 }, { "type": "loss", "content": 0.03119957633316517, "timestamp": "2025-09-30 22:12:47.036326", "step": 2034, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:47.092090", "step": 2034, "epoch": 3 }, { "type": "loss", "content": 0.013563781045377254, "timestamp": "2025-09-30 22:12:47.103675", "step": 2035, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:47.172647", "step": 2035, "epoch": 3 }, { "type": "loss", "content": 0.013546071946620941, "timestamp": "2025-09-30 22:12:47.179641", "step": 2036, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:47.235262", "step": 2036, "epoch": 3 }, { "type": "loss", "content": 0.013057815842330456, "timestamp": "2025-09-30 22:12:47.237721", "step": 2037, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:47.304263", "step": 2037, "epoch": 3 }, { "type": "loss", "content": 0.028074579313397408, "timestamp": "2025-09-30 22:12:47.307359", "step": 2038, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:47.365018", "step": 2038, "epoch": 3 }, { "type": "loss", "content": 0.0029875219333916903, "timestamp": "2025-09-30 22:12:47.376522", "step": 2039, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:47.440639", "step": 2039, "epoch": 3 }, { "type": "loss", "content": 0.002364709507673979, "timestamp": "2025-09-30 22:12:47.453613", "step": 2040, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:47.519200", "step": 2040, "epoch": 3 }, { "type": "loss", "content": 0.01649676077067852, "timestamp": "2025-09-30 22:12:47.528985", "step": 2041, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:47.591132", "step": 2041, "epoch": 3 }, { "type": "loss", "content": 0.0018646756652742624, "timestamp": "2025-09-30 22:12:47.594259", "step": 2042, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:47.657779", "step": 2042, "epoch": 3 }, { "type": "loss", "content": 0.018227288499474525, "timestamp": "2025-09-30 22:12:47.662542", "step": 2043, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:47.725112", "step": 2043, "epoch": 3 }, { "type": "loss", "content": 0.02195359393954277, "timestamp": "2025-09-30 22:12:47.739489", "step": 2044, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:47.807304", "step": 2044, "epoch": 3 }, { "type": "loss", "content": 0.00557445315644145, "timestamp": "2025-09-30 22:12:47.811883", "step": 2045, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:47.868755", "step": 2045, "epoch": 3 }, { "type": "loss", "content": 0.04431688040494919, "timestamp": "2025-09-30 22:12:47.873901", "step": 2046, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:47.937423", "step": 2046, "epoch": 3 }, { "type": "loss", "content": 0.018206002190709114, "timestamp": "2025-09-30 22:12:47.947104", "step": 2047, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:48.006345", "step": 2047, "epoch": 3 }, { "type": "loss", "content": 0.006048198323696852, "timestamp": "2025-09-30 22:12:48.024417", "step": 2048, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:48.092028", "step": 2048, "epoch": 3 }, { "type": "loss", "content": 0.004360709339380264, "timestamp": "2025-09-30 22:12:48.107431", "step": 2049, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:48.174773", "step": 2049, "epoch": 3 }, { "type": "loss", "content": 0.011058392934501171, "timestamp": "2025-09-30 22:12:48.187116", "step": 2050, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:48.242660", "step": 2050, "epoch": 3 }, { "type": "loss", "content": 0.00571943586692214, "timestamp": "2025-09-30 22:12:48.249247", "step": 2051, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:48.310667", "step": 2051, "epoch": 3 }, { "type": "loss", "content": 0.026921484619379044, "timestamp": "2025-09-30 22:12:48.325873", "step": 2052, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:12:49.721444", "step": 2052, "epoch": 3 }, { "type": "pplx", "content": 31591669.075829722, "timestamp": "2025-09-30 22:12:49.725309", "step": 2052, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:49.784999", "step": 2052, "epoch": 3 }, { "type": "loss", "content": 0.007747288327664137, "timestamp": "2025-09-30 22:12:49.797589", "step": 2053, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:49.852580", "step": 2053, "epoch": 3 }, { "type": "loss", "content": 0.0032807444222271442, "timestamp": "2025-09-30 22:12:49.855656", "step": 2054, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:49.917143", "step": 2054, "epoch": 3 }, { "type": "loss", "content": 0.013838792219758034, "timestamp": "2025-09-30 22:12:49.931141", "step": 2055, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:49.987250", "step": 2055, "epoch": 3 }, { "type": "loss", "content": 0.002472275635227561, "timestamp": "2025-09-30 22:12:49.995095", "step": 2056, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:50.050210", "step": 2056, "epoch": 3 }, { "type": "loss", "content": 0.01641083136200905, "timestamp": "2025-09-30 22:12:50.061765", "step": 2057, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:50.122990", "step": 2057, "epoch": 3 }, { "type": "loss", "content": 0.00696180434897542, "timestamp": "2025-09-30 22:12:50.126640", "step": 2058, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:50.183835", "step": 2058, "epoch": 3 }, { "type": "loss", "content": 0.006303395610302687, "timestamp": "2025-09-30 22:12:50.196071", "step": 2059, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:50.252924", "step": 2059, "epoch": 3 }, { "type": "loss", "content": 0.011265805922448635, "timestamp": "2025-09-30 22:12:50.267874", "step": 2060, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:50.326364", "step": 2060, "epoch": 3 }, { "type": "loss", "content": 0.01980595663189888, "timestamp": "2025-09-30 22:12:50.341639", "step": 2061, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:50.400756", "step": 2061, "epoch": 3 }, { "type": "loss", "content": 0.002819489687681198, "timestamp": "2025-09-30 22:12:50.406844", "step": 2062, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:50.463800", "step": 2062, "epoch": 3 }, { "type": "loss", "content": 0.007079659961163998, "timestamp": "2025-09-30 22:12:50.467330", "step": 2063, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:50.524689", "step": 2063, "epoch": 3 }, { "type": "loss", "content": 0.009019630961120129, "timestamp": "2025-09-30 22:12:50.533155", "step": 2064, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:50.603580", "step": 2064, "epoch": 3 }, { "type": "loss", "content": 0.0046064346097409725, "timestamp": "2025-09-30 22:12:50.606342", "step": 2065, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:50.662393", "step": 2065, "epoch": 3 }, { "type": "loss", "content": 0.025710809975862503, "timestamp": "2025-09-30 22:12:50.675290", "step": 2066, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:50.739641", "step": 2066, "epoch": 3 }, { "type": "loss", "content": 0.01167394407093525, "timestamp": "2025-09-30 22:12:50.742662", "step": 2067, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:50.805429", "step": 2067, "epoch": 3 }, { "type": "loss", "content": 0.021902693435549736, "timestamp": "2025-09-30 22:12:50.812165", "step": 2068, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:50.871441", "step": 2068, "epoch": 3 }, { "type": "loss", "content": 0.004060753621160984, "timestamp": "2025-09-30 22:12:50.874617", "step": 2069, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:50.929615", "step": 2069, "epoch": 3 }, { "type": "loss", "content": 0.023838216438889503, "timestamp": "2025-09-30 22:12:50.933346", "step": 2070, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:50.988296", "step": 2070, "epoch": 3 }, { "type": "loss", "content": 0.000567333830986172, "timestamp": "2025-09-30 22:12:50.991262", "step": 2071, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:51.048304", "step": 2071, "epoch": 3 }, { "type": "loss", "content": 0.033882711082696915, "timestamp": "2025-09-30 22:12:51.054912", "step": 2072, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:51.125544", "step": 2072, "epoch": 3 }, { "type": "loss", "content": 0.01983877643942833, "timestamp": "2025-09-30 22:12:51.128731", "step": 2073, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:51.183987", "step": 2073, "epoch": 3 }, { "type": "loss", "content": 0.012886099517345428, "timestamp": "2025-09-30 22:12:51.192788", "step": 2074, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:51.255083", "step": 2074, "epoch": 3 }, { "type": "loss", "content": 0.005958546884357929, "timestamp": "2025-09-30 22:12:51.257289", "step": 2075, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:51.325526", "step": 2075, "epoch": 3 }, { "type": "loss", "content": 0.005503936670720577, "timestamp": "2025-09-30 22:12:51.333228", "step": 2076, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:51.387290", "step": 2076, "epoch": 3 }, { "type": "loss", "content": 0.0063980803824961185, "timestamp": "2025-09-30 22:12:51.396594", "step": 2077, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:51.456724", "step": 2077, "epoch": 3 }, { "type": "loss", "content": 0.012718225829303265, "timestamp": "2025-09-30 22:12:51.459467", "step": 2078, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:51.520522", "step": 2078, "epoch": 3 }, { "type": "loss", "content": 0.02810058370232582, "timestamp": "2025-09-30 22:12:51.523666", "step": 2079, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:51.578431", "step": 2079, "epoch": 3 }, { "type": "loss", "content": 0.00501110078766942, "timestamp": "2025-09-30 22:12:51.585017", "step": 2080, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:51.640545", "step": 2080, "epoch": 3 }, { "type": "loss", "content": 0.015446553006768227, "timestamp": "2025-09-30 22:12:51.649823", "step": 2081, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:51.706135", "step": 2081, "epoch": 3 }, { "type": "loss", "content": 0.006646531168371439, "timestamp": "2025-09-30 22:12:51.709919", "step": 2082, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:51.770248", "step": 2082, "epoch": 3 }, { "type": "loss", "content": 0.007312596309930086, "timestamp": "2025-09-30 22:12:51.773296", "step": 2083, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:51.834524", "step": 2083, "epoch": 3 }, { "type": "loss", "content": 0.001499996636994183, "timestamp": "2025-09-30 22:12:51.847405", "step": 2084, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:51.901463", "step": 2084, "epoch": 3 }, { "type": "loss", "content": 0.03770451620221138, "timestamp": "2025-09-30 22:12:51.904924", "step": 2085, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:51.961949", "step": 2085, "epoch": 3 }, { "type": "loss", "content": 0.013079524040222168, "timestamp": "2025-09-30 22:12:51.965128", "step": 2086, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:52.023658", "step": 2086, "epoch": 3 }, { "type": "loss", "content": 0.007665101904422045, "timestamp": "2025-09-30 22:12:52.026475", "step": 2087, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:52.099632", "step": 2087, "epoch": 3 }, { "type": "loss", "content": 0.014709694311022758, "timestamp": "2025-09-30 22:12:52.114550", "step": 2088, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:52.174855", "step": 2088, "epoch": 3 }, { "type": "loss", "content": 0.0035318650770932436, "timestamp": "2025-09-30 22:12:52.178697", "step": 2089, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:12:52.242420", "step": 2089, "epoch": 3 }, { "type": "loss", "content": 0.0026837512850761414, "timestamp": "2025-09-30 22:12:52.245902", "step": 2090, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:52.303214", "step": 2090, "epoch": 3 }, { "type": "loss", "content": 0.003676437307149172, "timestamp": "2025-09-30 22:12:52.307125", "step": 2091, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:52.366135", "step": 2091, "epoch": 3 }, { "type": "loss", "content": 0.022839205339550972, "timestamp": "2025-09-30 22:12:52.375466", "step": 2092, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:52.432074", "step": 2092, "epoch": 3 }, { "type": "loss", "content": 0.0020653358660638332, "timestamp": "2025-09-30 22:12:52.435305", "step": 2093, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:52.496909", "step": 2093, "epoch": 3 }, { "type": "loss", "content": 0.010991484858095646, "timestamp": "2025-09-30 22:12:52.500189", "step": 2094, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:52.560748", "step": 2094, "epoch": 3 }, { "type": "loss", "content": 0.005481290630996227, "timestamp": "2025-09-30 22:12:52.565143", "step": 2095, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:52.620407", "step": 2095, "epoch": 3 }, { "type": "loss", "content": 0.04029859974980354, "timestamp": "2025-09-30 22:12:52.627397", "step": 2096, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:52.684592", "step": 2096, "epoch": 3 }, { "type": "loss", "content": 0.0042072138749063015, "timestamp": "2025-09-30 22:12:52.689409", "step": 2097, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:52.747358", "step": 2097, "epoch": 3 }, { "type": "loss", "content": 0.049394670873880386, "timestamp": "2025-09-30 22:12:52.751278", "step": 2098, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:52.806129", "step": 2098, "epoch": 3 }, { "type": "loss", "content": 0.020421581342816353, "timestamp": "2025-09-30 22:12:52.809621", "step": 2099, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:52.865837", "step": 2099, "epoch": 3 }, { "type": "loss", "content": 0.010594218969345093, "timestamp": "2025-09-30 22:12:52.872798", "step": 2100, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:52.926620", "step": 2100, "epoch": 3 }, { "type": "loss", "content": 0.0015280555235221982, "timestamp": "2025-09-30 22:12:52.935170", "step": 2101, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:52.990431", "step": 2101, "epoch": 3 }, { "type": "loss", "content": 0.0070367841981351376, "timestamp": "2025-09-30 22:12:52.993310", "step": 2102, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:53.050292", "step": 2102, "epoch": 3 }, { "type": "loss", "content": 0.030247675254940987, "timestamp": "2025-09-30 22:12:53.053333", "step": 2103, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:53.116406", "step": 2103, "epoch": 3 }, { "type": "loss", "content": 0.0010143903782591224, "timestamp": "2025-09-30 22:12:53.123065", "step": 2104, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:53.176416", "step": 2104, "epoch": 3 }, { "type": "loss", "content": 0.0039184377528727055, "timestamp": "2025-09-30 22:12:53.179306", "step": 2105, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:53.235365", "step": 2105, "epoch": 3 }, { "type": "loss", "content": 0.007116036955267191, "timestamp": "2025-09-30 22:12:53.238300", "step": 2106, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:53.293136", "step": 2106, "epoch": 3 }, { "type": "loss", "content": 0.02538345754146576, "timestamp": "2025-09-30 22:12:53.296719", "step": 2107, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:53.358073", "step": 2107, "epoch": 3 }, { "type": "loss", "content": 0.007159593049436808, "timestamp": "2025-09-30 22:12:53.377858", "step": 2108, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:53.433335", "step": 2108, "epoch": 3 }, { "type": "loss", "content": 0.012715340591967106, "timestamp": "2025-09-30 22:12:53.435844", "step": 2109, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:12:54.807881", "step": 2109, "epoch": 3 }, { "type": "pplx", "content": 31717602.29605612, "timestamp": "2025-09-30 22:12:54.810448", "step": 2109, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:54.866666", "step": 2109, "epoch": 3 }, { "type": "loss", "content": 0.0069103785790503025, "timestamp": "2025-09-30 22:12:54.870626", "step": 2110, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:54.925774", "step": 2110, "epoch": 3 }, { "type": "loss", "content": 0.0004411868576426059, "timestamp": "2025-09-30 22:12:54.928538", "step": 2111, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:54.986256", "step": 2111, "epoch": 3 }, { "type": "loss", "content": 0.02559594437479973, "timestamp": "2025-09-30 22:12:54.994525", "step": 2112, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:55.056307", "step": 2112, "epoch": 3 }, { "type": "loss", "content": 0.006536331493407488, "timestamp": "2025-09-30 22:12:55.061762", "step": 2113, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:55.119574", "step": 2113, "epoch": 3 }, { "type": "loss", "content": 0.034199222922325134, "timestamp": "2025-09-30 22:12:55.121736", "step": 2114, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:55.184055", "step": 2114, "epoch": 3 }, { "type": "loss", "content": 0.0037060887552797794, "timestamp": "2025-09-30 22:12:55.190340", "step": 2115, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:55.246418", "step": 2115, "epoch": 3 }, { "type": "loss", "content": 0.010455816984176636, "timestamp": "2025-09-30 22:12:55.257495", "step": 2116, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:55.323363", "step": 2116, "epoch": 3 }, { "type": "loss", "content": 0.009131490252912045, "timestamp": "2025-09-30 22:12:55.329740", "step": 2117, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:55.385535", "step": 2117, "epoch": 3 }, { "type": "loss", "content": 0.028637733310461044, "timestamp": "2025-09-30 22:12:55.388119", "step": 2118, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:55.462086", "step": 2118, "epoch": 3 }, { "type": "loss", "content": 0.0017552494537085295, "timestamp": "2025-09-30 22:12:55.464428", "step": 2119, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:55.524997", "step": 2119, "epoch": 3 }, { "type": "loss", "content": 0.0008230188977904618, "timestamp": "2025-09-30 22:12:55.532668", "step": 2120, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:12:55.586377", "step": 2120, "epoch": 3 }, { "type": "loss", "content": 0.001255502225831151, "timestamp": "2025-09-30 22:12:55.599866", "step": 2121, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:55.687367", "step": 2121, "epoch": 3 }, { "type": "loss", "content": 0.03605801239609718, "timestamp": "2025-09-30 22:12:55.689607", "step": 2122, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:55.780220", "step": 2122, "epoch": 3 }, { "type": "loss", "content": 0.02137705311179161, "timestamp": "2025-09-30 22:12:55.784688", "step": 2123, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:55.876184", "step": 2123, "epoch": 3 }, { "type": "loss", "content": 0.00452017318457365, "timestamp": "2025-09-30 22:12:55.885171", "step": 2124, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:55.977205", "step": 2124, "epoch": 3 }, { "type": "loss", "content": 0.022074688225984573, "timestamp": "2025-09-30 22:12:55.979429", "step": 2125, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:56.069986", "step": 2125, "epoch": 3 }, { "type": "loss", "content": 0.01156954187899828, "timestamp": "2025-09-30 22:12:56.072411", "step": 2126, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:56.160302", "step": 2126, "epoch": 3 }, { "type": "loss", "content": 0.01876743696630001, "timestamp": "2025-09-30 22:12:56.162722", "step": 2127, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:56.258051", "step": 2127, "epoch": 3 }, { "type": "loss", "content": 0.018221214413642883, "timestamp": "2025-09-30 22:12:56.268989", "step": 2128, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:56.343166", "step": 2128, "epoch": 3 }, { "type": "loss", "content": 0.011680861935019493, "timestamp": "2025-09-30 22:12:56.345766", "step": 2129, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:56.427254", "step": 2129, "epoch": 3 }, { "type": "loss", "content": 0.007095720618963242, "timestamp": "2025-09-30 22:12:56.431297", "step": 2130, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:56.498838", "step": 2130, "epoch": 3 }, { "type": "loss", "content": 0.004647306632250547, "timestamp": "2025-09-30 22:12:56.501718", "step": 2131, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:56.576126", "step": 2131, "epoch": 3 }, { "type": "loss", "content": 0.012296868488192558, "timestamp": "2025-09-30 22:12:56.582860", "step": 2132, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:56.659773", "step": 2132, "epoch": 3 }, { "type": "loss", "content": 0.03337612748146057, "timestamp": "2025-09-30 22:12:56.663029", "step": 2133, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:56.744957", "step": 2133, "epoch": 3 }, { "type": "loss", "content": 0.00937966164201498, "timestamp": "2025-09-30 22:12:56.748135", "step": 2134, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:56.821695", "step": 2134, "epoch": 3 }, { "type": "loss", "content": 0.002799929352477193, "timestamp": "2025-09-30 22:12:56.825877", "step": 2135, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:56.906853", "step": 2135, "epoch": 3 }, { "type": "loss", "content": 0.0039896611124277115, "timestamp": "2025-09-30 22:12:56.913216", "step": 2136, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:56.988658", "step": 2136, "epoch": 3 }, { "type": "loss", "content": 0.00480355229228735, "timestamp": "2025-09-30 22:12:56.998934", "step": 2137, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:57.075916", "step": 2137, "epoch": 3 }, { "type": "loss", "content": 0.006306238938122988, "timestamp": "2025-09-30 22:12:57.078555", "step": 2138, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:57.158792", "step": 2138, "epoch": 3 }, { "type": "loss", "content": 0.0029865510296076536, "timestamp": "2025-09-30 22:12:57.166147", "step": 2139, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:57.241108", "step": 2139, "epoch": 3 }, { "type": "loss", "content": 0.00501589709892869, "timestamp": "2025-09-30 22:12:57.252081", "step": 2140, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:57.309098", "step": 2140, "epoch": 3 }, { "type": "loss", "content": 0.0062422240152955055, "timestamp": "2025-09-30 22:12:57.311877", "step": 2141, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:57.370622", "step": 2141, "epoch": 3 }, { "type": "loss", "content": 0.014008880592882633, "timestamp": "2025-09-30 22:12:57.373707", "step": 2142, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:57.446870", "step": 2142, "epoch": 3 }, { "type": "loss", "content": 0.0016020223265513778, "timestamp": "2025-09-30 22:12:57.450732", "step": 2143, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:57.506223", "step": 2143, "epoch": 3 }, { "type": "loss", "content": 0.007073293440043926, "timestamp": "2025-09-30 22:12:57.520776", "step": 2144, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:57.581482", "step": 2144, "epoch": 3 }, { "type": "loss", "content": 0.006945033557713032, "timestamp": "2025-09-30 22:12:57.585496", "step": 2145, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:57.640324", "step": 2145, "epoch": 3 }, { "type": "loss", "content": 0.0017107333987951279, "timestamp": "2025-09-30 22:12:57.642964", "step": 2146, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:57.697675", "step": 2146, "epoch": 3 }, { "type": "loss", "content": 0.020304743200540543, "timestamp": "2025-09-30 22:12:57.701181", "step": 2147, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:57.755963", "step": 2147, "epoch": 3 }, { "type": "loss", "content": 0.004924300592392683, "timestamp": "2025-09-30 22:12:57.762052", "step": 2148, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:57.825982", "step": 2148, "epoch": 3 }, { "type": "loss", "content": 0.008847953751683235, "timestamp": "2025-09-30 22:12:57.828336", "step": 2149, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:57.895714", "step": 2149, "epoch": 3 }, { "type": "loss", "content": 0.0031908315140753984, "timestamp": "2025-09-30 22:12:57.898745", "step": 2150, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:57.954645", "step": 2150, "epoch": 3 }, { "type": "loss", "content": 0.009887597523629665, "timestamp": "2025-09-30 22:12:57.957892", "step": 2151, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:58.016821", "step": 2151, "epoch": 3 }, { "type": "loss", "content": 0.016261015087366104, "timestamp": "2025-09-30 22:12:58.023858", "step": 2152, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:58.082714", "step": 2152, "epoch": 3 }, { "type": "loss", "content": 0.011811012402176857, "timestamp": "2025-09-30 22:12:58.086341", "step": 2153, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:58.143616", "step": 2153, "epoch": 3 }, { "type": "loss", "content": 0.006635370198637247, "timestamp": "2025-09-30 22:12:58.146935", "step": 2154, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:58.203868", "step": 2154, "epoch": 3 }, { "type": "loss", "content": 0.009925504215061665, "timestamp": "2025-09-30 22:12:58.206731", "step": 2155, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:58.276892", "step": 2155, "epoch": 3 }, { "type": "loss", "content": 0.01277694571763277, "timestamp": "2025-09-30 22:12:58.284433", "step": 2156, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:58.341811", "step": 2156, "epoch": 3 }, { "type": "loss", "content": 0.019755104556679726, "timestamp": "2025-09-30 22:12:58.345051", "step": 2157, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:58.401533", "step": 2157, "epoch": 3 }, { "type": "loss", "content": 0.005314360372722149, "timestamp": "2025-09-30 22:12:58.404741", "step": 2158, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:58.462362", "step": 2158, "epoch": 3 }, { "type": "loss", "content": 0.008464450016617775, "timestamp": "2025-09-30 22:12:58.465423", "step": 2159, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:58.536232", "step": 2159, "epoch": 3 }, { "type": "loss", "content": 0.02357994019985199, "timestamp": "2025-09-30 22:12:58.543507", "step": 2160, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:58.602367", "step": 2160, "epoch": 3 }, { "type": "loss", "content": 0.015320166014134884, "timestamp": "2025-09-30 22:12:58.609775", "step": 2161, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:12:58.664326", "step": 2161, "epoch": 3 }, { "type": "loss", "content": 0.026179542765021324, "timestamp": "2025-09-30 22:12:58.667705", "step": 2162, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:12:58.728789", "step": 2162, "epoch": 3 }, { "type": "loss", "content": 0.021009227260947227, "timestamp": "2025-09-30 22:12:58.732799", "step": 2163, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:58.795597", "step": 2163, "epoch": 3 }, { "type": "loss", "content": 0.019618257880210876, "timestamp": "2025-09-30 22:12:58.801753", "step": 2164, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:12:58.859468", "step": 2164, "epoch": 3 }, { "type": "loss", "content": 0.0046376558020710945, "timestamp": "2025-09-30 22:12:58.863573", "step": 2165, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:12:58.919648", "step": 2165, "epoch": 3 }, { "type": "loss", "content": 0.013165593147277832, "timestamp": "2025-09-30 22:12:58.934748", "step": 2166, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:13:00.344251", "step": 2166, "epoch": 3 }, { "type": "pplx", "content": 28392377.416443832, "timestamp": "2025-09-30 22:13:00.348523", "step": 2166, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:00.405249", "step": 2166, "epoch": 3 }, { "type": "loss", "content": 0.01954605057835579, "timestamp": "2025-09-30 22:13:00.413299", "step": 2167, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:00.470031", "step": 2167, "epoch": 3 }, { "type": "loss", "content": 0.022829841822385788, "timestamp": "2025-09-30 22:13:00.484098", "step": 2168, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:00.539409", "step": 2168, "epoch": 3 }, { "type": "loss", "content": 0.007014808710664511, "timestamp": "2025-09-30 22:13:00.541868", "step": 2169, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:13:00.603810", "step": 2169, "epoch": 3 }, { "type": "loss", "content": 0.0007217395468614995, "timestamp": "2025-09-30 22:13:00.609050", "step": 2170, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:00.668024", "step": 2170, "epoch": 3 }, { "type": "loss", "content": 0.0036041310522705317, "timestamp": "2025-09-30 22:13:00.671095", "step": 2171, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:00.732930", "step": 2171, "epoch": 3 }, { "type": "loss", "content": 0.0017051756149157882, "timestamp": "2025-09-30 22:13:00.740229", "step": 2172, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:00.802077", "step": 2172, "epoch": 3 }, { "type": "loss", "content": 0.01299409568309784, "timestamp": "2025-09-30 22:13:00.806365", "step": 2173, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:00.860598", "step": 2173, "epoch": 3 }, { "type": "loss", "content": 0.008998853154480457, "timestamp": "2025-09-30 22:13:00.868540", "step": 2174, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:00.923574", "step": 2174, "epoch": 3 }, { "type": "loss", "content": 0.022543568164110184, "timestamp": "2025-09-30 22:13:00.933624", "step": 2175, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:00.995139", "step": 2175, "epoch": 3 }, { "type": "loss", "content": 0.00293608452193439, "timestamp": "2025-09-30 22:13:01.001532", "step": 2176, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:01.063423", "step": 2176, "epoch": 3 }, { "type": "loss", "content": 0.007718019187450409, "timestamp": "2025-09-30 22:13:01.077359", "step": 2177, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:01.139374", "step": 2177, "epoch": 3 }, { "type": "loss", "content": 0.05061681196093559, "timestamp": "2025-09-30 22:13:01.142240", "step": 2178, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:01.205104", "step": 2178, "epoch": 3 }, { "type": "loss", "content": 0.0036929224152117968, "timestamp": "2025-09-30 22:13:01.209922", "step": 2179, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:01.274566", "step": 2179, "epoch": 3 }, { "type": "loss", "content": 0.010988332331180573, "timestamp": "2025-09-30 22:13:01.282766", "step": 2180, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:01.339141", "step": 2180, "epoch": 3 }, { "type": "loss", "content": 0.006843825336545706, "timestamp": "2025-09-30 22:13:01.345966", "step": 2181, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:01.404084", "step": 2181, "epoch": 3 }, { "type": "loss", "content": 0.0031122539658099413, "timestamp": "2025-09-30 22:13:01.410275", "step": 2182, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:01.467236", "step": 2182, "epoch": 3 }, { "type": "loss", "content": 0.012426759116351604, "timestamp": "2025-09-30 22:13:01.470013", "step": 2183, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:01.527816", "step": 2183, "epoch": 3 }, { "type": "loss", "content": 0.011462002992630005, "timestamp": "2025-09-30 22:13:01.537702", "step": 2184, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:01.603326", "step": 2184, "epoch": 3 }, { "type": "loss", "content": 0.013611538335680962, "timestamp": "2025-09-30 22:13:01.605682", "step": 2185, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:01.662218", "step": 2185, "epoch": 3 }, { "type": "loss", "content": 0.00627383217215538, "timestamp": "2025-09-30 22:13:01.666345", "step": 2186, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:01.723862", "step": 2186, "epoch": 3 }, { "type": "loss", "content": 0.016363272443413734, "timestamp": "2025-09-30 22:13:01.728741", "step": 2187, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:01.787447", "step": 2187, "epoch": 3 }, { "type": "loss", "content": 0.01059199869632721, "timestamp": "2025-09-30 22:13:01.795897", "step": 2188, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:01.851716", "step": 2188, "epoch": 3 }, { "type": "loss", "content": 0.007782274391502142, "timestamp": "2025-09-30 22:13:01.854373", "step": 2189, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:01.909906", "step": 2189, "epoch": 3 }, { "type": "loss", "content": 0.004535790532827377, "timestamp": "2025-09-30 22:13:01.919337", "step": 2190, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:01.983918", "step": 2190, "epoch": 3 }, { "type": "loss", "content": 0.028639093041419983, "timestamp": "2025-09-30 22:13:01.992130", "step": 2191, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:02.047436", "step": 2191, "epoch": 3 }, { "type": "loss", "content": 0.0020749655086547136, "timestamp": "2025-09-30 22:13:02.053529", "step": 2192, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:02.114527", "step": 2192, "epoch": 3 }, { "type": "loss", "content": 0.004414039198309183, "timestamp": "2025-09-30 22:13:02.120142", "step": 2193, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:02.181647", "step": 2193, "epoch": 3 }, { "type": "loss", "content": 0.022147908806800842, "timestamp": "2025-09-30 22:13:02.193109", "step": 2194, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:02.261747", "step": 2194, "epoch": 3 }, { "type": "loss", "content": 0.0016150318551808596, "timestamp": "2025-09-30 22:13:02.274714", "step": 2195, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:02.342578", "step": 2195, "epoch": 3 }, { "type": "loss", "content": 0.04595841094851494, "timestamp": "2025-09-30 22:13:02.361835", "step": 2196, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:02.421979", "step": 2196, "epoch": 3 }, { "type": "loss", "content": 0.02536730282008648, "timestamp": "2025-09-30 22:13:02.426233", "step": 2197, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:02.480360", "step": 2197, "epoch": 3 }, { "type": "loss", "content": 0.0070744892582297325, "timestamp": "2025-09-30 22:13:02.489001", "step": 2198, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:02.544743", "step": 2198, "epoch": 3 }, { "type": "loss", "content": 0.005618877708911896, "timestamp": "2025-09-30 22:13:02.547146", "step": 2199, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:02.606597", "step": 2199, "epoch": 3 }, { "type": "loss", "content": 0.0013747888151556253, "timestamp": "2025-09-30 22:13:02.615649", "step": 2200, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:13:02.670306", "step": 2200, "epoch": 3 }, { "type": "loss", "content": 0.00427739042788744, "timestamp": "2025-09-30 22:13:02.672784", "step": 2201, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:02.749819", "step": 2201, "epoch": 3 }, { "type": "loss", "content": 0.012922806665301323, "timestamp": "2025-09-30 22:13:02.752972", "step": 2202, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:02.811834", "step": 2202, "epoch": 3 }, { "type": "loss", "content": 0.0016593519831076264, "timestamp": "2025-09-30 22:13:02.817260", "step": 2203, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:02.872153", "step": 2203, "epoch": 3 }, { "type": "loss", "content": 0.003395694075152278, "timestamp": "2025-09-30 22:13:02.881138", "step": 2204, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:02.939609", "step": 2204, "epoch": 3 }, { "type": "loss", "content": 0.0013927072286605835, "timestamp": "2025-09-30 22:13:02.942410", "step": 2205, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:02.998936", "step": 2205, "epoch": 3 }, { "type": "loss", "content": 0.017614727839827538, "timestamp": "2025-09-30 22:13:03.008428", "step": 2206, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:03.080214", "step": 2206, "epoch": 3 }, { "type": "loss", "content": 0.0028198177460581064, "timestamp": "2025-09-30 22:13:03.083580", "step": 2207, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:03.148948", "step": 2207, "epoch": 3 }, { "type": "loss", "content": 0.02004883624613285, "timestamp": "2025-09-30 22:13:03.156280", "step": 2208, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:03.215571", "step": 2208, "epoch": 3 }, { "type": "loss", "content": 0.010212014429271221, "timestamp": "2025-09-30 22:13:03.221426", "step": 2209, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:03.276640", "step": 2209, "epoch": 3 }, { "type": "loss", "content": 0.002033967524766922, "timestamp": "2025-09-30 22:13:03.283455", "step": 2210, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:03.343725", "step": 2210, "epoch": 3 }, { "type": "loss", "content": 0.0013138767099007964, "timestamp": "2025-09-30 22:13:03.347390", "step": 2211, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:03.413671", "step": 2211, "epoch": 3 }, { "type": "loss", "content": 0.0013939599739387631, "timestamp": "2025-09-30 22:13:03.432049", "step": 2212, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:03.486732", "step": 2212, "epoch": 3 }, { "type": "loss", "content": 0.024434858933091164, "timestamp": "2025-09-30 22:13:03.489344", "step": 2213, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:03.553859", "step": 2213, "epoch": 3 }, { "type": "loss", "content": 0.008440472185611725, "timestamp": "2025-09-30 22:13:03.563935", "step": 2214, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:03.624857", "step": 2214, "epoch": 3 }, { "type": "loss", "content": 0.01175626926124096, "timestamp": "2025-09-30 22:13:03.628668", "step": 2215, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:03.696209", "step": 2215, "epoch": 3 }, { "type": "loss", "content": 0.018027642741799355, "timestamp": "2025-09-30 22:13:03.706990", "step": 2216, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:03.764429", "step": 2216, "epoch": 3 }, { "type": "loss", "content": 0.008521536365151405, "timestamp": "2025-09-30 22:13:03.772218", "step": 2217, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:03.836575", "step": 2217, "epoch": 3 }, { "type": "loss", "content": 0.008888277225196362, "timestamp": "2025-09-30 22:13:03.840283", "step": 2218, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:03.902344", "step": 2218, "epoch": 3 }, { "type": "loss", "content": 0.0012383210705593228, "timestamp": "2025-09-30 22:13:03.910926", "step": 2219, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:03.977533", "step": 2219, "epoch": 3 }, { "type": "loss", "content": 0.0042413403280079365, "timestamp": "2025-09-30 22:13:03.984542", "step": 2220, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:04.040130", "step": 2220, "epoch": 3 }, { "type": "loss", "content": 0.0010714359814301133, "timestamp": "2025-09-30 22:13:04.042791", "step": 2221, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:04.107674", "step": 2221, "epoch": 3 }, { "type": "loss", "content": 0.006664037238806486, "timestamp": "2025-09-30 22:13:04.111000", "step": 2222, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:04.170928", "step": 2222, "epoch": 3 }, { "type": "loss", "content": 0.0031583583913743496, "timestamp": "2025-09-30 22:13:04.173623", "step": 2223, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:13:05.490048", "step": 2223, "epoch": 3 }, { "type": "pplx", "content": 29344370.172395393, "timestamp": "2025-09-30 22:13:05.492733", "step": 2223, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:05.546368", "step": 2223, "epoch": 3 }, { "type": "loss", "content": 0.006295633502304554, "timestamp": "2025-09-30 22:13:05.552730", "step": 2224, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:05.607930", "step": 2224, "epoch": 3 }, { "type": "loss", "content": 0.003405241295695305, "timestamp": "2025-09-30 22:13:05.610697", "step": 2225, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:05.667829", "step": 2225, "epoch": 3 }, { "type": "loss", "content": 0.002446610014885664, "timestamp": "2025-09-30 22:13:05.673988", "step": 2226, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:05.729752", "step": 2226, "epoch": 3 }, { "type": "loss", "content": 0.017070619389414787, "timestamp": "2025-09-30 22:13:05.733291", "step": 2227, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:13:05.789448", "step": 2227, "epoch": 3 }, { "type": "loss", "content": 0.01944059133529663, "timestamp": "2025-09-30 22:13:05.796101", "step": 2228, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:05.852039", "step": 2228, "epoch": 3 }, { "type": "loss", "content": 0.0016683695139363408, "timestamp": "2025-09-30 22:13:05.857473", "step": 2229, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:05.925067", "step": 2229, "epoch": 3 }, { "type": "loss", "content": 0.0019555925391614437, "timestamp": "2025-09-30 22:13:05.929845", "step": 2230, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:05.989548", "step": 2230, "epoch": 3 }, { "type": "loss", "content": 0.00015620069461874664, "timestamp": "2025-09-30 22:13:05.991628", "step": 2231, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:06.052663", "step": 2231, "epoch": 3 }, { "type": "loss", "content": 0.01360904611647129, "timestamp": "2025-09-30 22:13:06.063942", "step": 2232, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:06.127300", "step": 2232, "epoch": 3 }, { "type": "loss", "content": 0.0014074391219764948, "timestamp": "2025-09-30 22:13:06.132756", "step": 2233, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:06.196856", "step": 2233, "epoch": 3 }, { "type": "loss", "content": 0.002081622602418065, "timestamp": "2025-09-30 22:13:06.204891", "step": 2234, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:06.262721", "step": 2234, "epoch": 3 }, { "type": "loss", "content": 0.0013570208102464676, "timestamp": "2025-09-30 22:13:06.273367", "step": 2235, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:06.341281", "step": 2235, "epoch": 3 }, { "type": "loss", "content": 0.0063094040378928185, "timestamp": "2025-09-30 22:13:06.353285", "step": 2236, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:06.425103", "step": 2236, "epoch": 3 }, { "type": "loss", "content": 0.0011793047888204455, "timestamp": "2025-09-30 22:13:06.432018", "step": 2237, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:06.494162", "step": 2237, "epoch": 3 }, { "type": "loss", "content": 0.024860413745045662, "timestamp": "2025-09-30 22:13:06.499047", "step": 2238, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:06.571763", "step": 2238, "epoch": 3 }, { "type": "loss", "content": 0.029842732474207878, "timestamp": "2025-09-30 22:13:06.581059", "step": 2239, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:06.652497", "step": 2239, "epoch": 3 }, { "type": "loss", "content": 0.0033206476364284754, "timestamp": "2025-09-30 22:13:06.664027", "step": 2240, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:06.725036", "step": 2240, "epoch": 3 }, { "type": "loss", "content": 0.0032124409917742014, "timestamp": "2025-09-30 22:13:06.729256", "step": 2241, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:06.797711", "step": 2241, "epoch": 3 }, { "type": "loss", "content": 0.0014608422061428428, "timestamp": "2025-09-30 22:13:06.820254", "step": 2242, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:06.893327", "step": 2242, "epoch": 3 }, { "type": "loss", "content": 0.0015411525964736938, "timestamp": "2025-09-30 22:13:06.909198", "step": 2243, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:06.972246", "step": 2243, "epoch": 3 }, { "type": "loss", "content": 0.00187637226190418, "timestamp": "2025-09-30 22:13:06.983547", "step": 2244, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:07.038825", "step": 2244, "epoch": 3 }, { "type": "loss", "content": 0.0047594718635082245, "timestamp": "2025-09-30 22:13:07.045563", "step": 2245, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:07.105710", "step": 2245, "epoch": 3 }, { "type": "loss", "content": 0.003375373315066099, "timestamp": "2025-09-30 22:13:07.108166", "step": 2246, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:07.162366", "step": 2246, "epoch": 3 }, { "type": "loss", "content": 0.0013589292066171765, "timestamp": "2025-09-30 22:13:07.165331", "step": 2247, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:07.220888", "step": 2247, "epoch": 3 }, { "type": "loss", "content": 0.016323648393154144, "timestamp": "2025-09-30 22:13:07.229246", "step": 2248, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:07.283426", "step": 2248, "epoch": 3 }, { "type": "loss", "content": 0.006245364900678396, "timestamp": "2025-09-30 22:13:07.292940", "step": 2249, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:07.349128", "step": 2249, "epoch": 3 }, { "type": "loss", "content": 0.0017451593885198236, "timestamp": "2025-09-30 22:13:07.352875", "step": 2250, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:07.412427", "step": 2250, "epoch": 3 }, { "type": "loss", "content": 0.007775729056447744, "timestamp": "2025-09-30 22:13:07.423227", "step": 2251, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:07.480814", "step": 2251, "epoch": 3 }, { "type": "loss", "content": 0.02229670248925686, "timestamp": "2025-09-30 22:13:07.491348", "step": 2252, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:07.553274", "step": 2252, "epoch": 3 }, { "type": "loss", "content": 0.0005842237151227891, "timestamp": "2025-09-30 22:13:07.555760", "step": 2253, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:07.624606", "step": 2253, "epoch": 3 }, { "type": "loss", "content": 0.0024967549834400415, "timestamp": "2025-09-30 22:13:07.627473", "step": 2254, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:07.682185", "step": 2254, "epoch": 3 }, { "type": "loss", "content": 0.00876485276967287, "timestamp": "2025-09-30 22:13:07.685725", "step": 2255, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:07.748598", "step": 2255, "epoch": 3 }, { "type": "loss", "content": 0.00036741181975230575, "timestamp": "2025-09-30 22:13:07.754726", "step": 2256, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:07.814528", "step": 2256, "epoch": 3 }, { "type": "loss", "content": 0.0012112419353798032, "timestamp": "2025-09-30 22:13:07.818293", "step": 2257, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:07.892341", "step": 2257, "epoch": 3 }, { "type": "loss", "content": 0.0117741534486413, "timestamp": "2025-09-30 22:13:07.898564", "step": 2258, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:07.956914", "step": 2258, "epoch": 3 }, { "type": "loss", "content": 0.001920665497891605, "timestamp": "2025-09-30 22:13:07.968074", "step": 2259, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:08.024271", "step": 2259, "epoch": 3 }, { "type": "loss", "content": 0.0017800560453906655, "timestamp": "2025-09-30 22:13:08.032844", "step": 2260, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:08.087032", "step": 2260, "epoch": 3 }, { "type": "loss", "content": 0.007017010357230902, "timestamp": "2025-09-30 22:13:08.089549", "step": 2261, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:08.146410", "step": 2261, "epoch": 3 }, { "type": "loss", "content": 0.0003600542258936912, "timestamp": "2025-09-30 22:13:08.154264", "step": 2262, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:08.211686", "step": 2262, "epoch": 3 }, { "type": "loss", "content": 0.009401796385645866, "timestamp": "2025-09-30 22:13:08.214506", "step": 2263, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:08.270432", "step": 2263, "epoch": 3 }, { "type": "loss", "content": 0.004206747282296419, "timestamp": "2025-09-30 22:13:08.281054", "step": 2264, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:08.339524", "step": 2264, "epoch": 3 }, { "type": "loss", "content": 0.00021178685710765421, "timestamp": "2025-09-30 22:13:08.341888", "step": 2265, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:08.397581", "step": 2265, "epoch": 3 }, { "type": "loss", "content": 0.00048541155410930514, "timestamp": "2025-09-30 22:13:08.402602", "step": 2266, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:08.458086", "step": 2266, "epoch": 3 }, { "type": "loss", "content": 0.003917259629815817, "timestamp": "2025-09-30 22:13:08.464337", "step": 2267, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:13:08.523381", "step": 2267, "epoch": 3 }, { "type": "loss", "content": 0.003979508299380541, "timestamp": "2025-09-30 22:13:08.530734", "step": 2268, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:08.597171", "step": 2268, "epoch": 3 }, { "type": "loss", "content": 0.0003099239547736943, "timestamp": "2025-09-30 22:13:08.599264", "step": 2269, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:08.652680", "step": 2269, "epoch": 3 }, { "type": "loss", "content": 0.000424488156568259, "timestamp": "2025-09-30 22:13:08.662711", "step": 2270, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:08.715873", "step": 2270, "epoch": 3 }, { "type": "loss", "content": 0.007150634657591581, "timestamp": "2025-09-30 22:13:08.718812", "step": 2271, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:08.783543", "step": 2271, "epoch": 3 }, { "type": "loss", "content": 0.0024335901252925396, "timestamp": "2025-09-30 22:13:08.790618", "step": 2272, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:08.846206", "step": 2272, "epoch": 3 }, { "type": "loss", "content": 0.01159227080643177, "timestamp": "2025-09-30 22:13:08.849451", "step": 2273, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:08.917310", "step": 2273, "epoch": 3 }, { "type": "loss", "content": 0.005326881073415279, "timestamp": "2025-09-30 22:13:08.926533", "step": 2274, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:08.983661", "step": 2274, "epoch": 3 }, { "type": "loss", "content": 0.0012480862205848098, "timestamp": "2025-09-30 22:13:08.992832", "step": 2275, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:09.050359", "step": 2275, "epoch": 3 }, { "type": "loss", "content": 0.008051712065935135, "timestamp": "2025-09-30 22:13:09.056973", "step": 2276, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:09.120010", "step": 2276, "epoch": 3 }, { "type": "loss", "content": 0.0012699085054919124, "timestamp": "2025-09-30 22:13:09.125755", "step": 2277, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:09.180682", "step": 2277, "epoch": 3 }, { "type": "loss", "content": 0.006048406939953566, "timestamp": "2025-09-30 22:13:09.183726", "step": 2278, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:09.250575", "step": 2278, "epoch": 3 }, { "type": "loss", "content": 0.004453370813280344, "timestamp": "2025-09-30 22:13:09.253346", "step": 2279, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:09.308955", "step": 2279, "epoch": 3 }, { "type": "loss", "content": 0.002166403690353036, "timestamp": "2025-09-30 22:13:09.315558", "step": 2280, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:13:10.738278", "step": 2280, "epoch": 3 }, { "type": "pplx", "content": 31190960.103936635, "timestamp": "2025-09-30 22:13:10.740643", "step": 2280, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:10.792576", "step": 2280, "epoch": 3 }, { "type": "loss", "content": 0.00041843479266390204, "timestamp": "2025-09-30 22:13:10.795117", "step": 2281, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:10.859013", "step": 2281, "epoch": 3 }, { "type": "loss", "content": 0.006251118145883083, "timestamp": "2025-09-30 22:13:10.862684", "step": 2282, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:10.926799", "step": 2282, "epoch": 3 }, { "type": "loss", "content": 0.0009974894346669316, "timestamp": "2025-09-30 22:13:10.929266", "step": 2283, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:10.989116", "step": 2283, "epoch": 3 }, { "type": "loss", "content": 0.0032588716130703688, "timestamp": "2025-09-30 22:13:10.997818", "step": 2284, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:11.061570", "step": 2284, "epoch": 3 }, { "type": "loss", "content": 0.005364909302443266, "timestamp": "2025-09-30 22:13:11.064019", "step": 2285, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:11.122069", "step": 2285, "epoch": 3 }, { "type": "loss", "content": 8.350759890163317e-05, "timestamp": "2025-09-30 22:13:11.124572", "step": 2286, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:11.190995", "step": 2286, "epoch": 3 }, { "type": "loss", "content": 0.0043342201970517635, "timestamp": "2025-09-30 22:13:11.195044", "step": 2287, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:13:11.268671", "step": 2287, "epoch": 3 }, { "type": "loss", "content": 0.0011432368773967028, "timestamp": "2025-09-30 22:13:11.275236", "step": 2288, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:11.339399", "step": 2288, "epoch": 3 }, { "type": "loss", "content": 0.0010339574655517936, "timestamp": "2025-09-30 22:13:11.342362", "step": 2289, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:11.403531", "step": 2289, "epoch": 3 }, { "type": "loss", "content": 0.0022677092347294092, "timestamp": "2025-09-30 22:13:11.409800", "step": 2290, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:11.472241", "step": 2290, "epoch": 3 }, { "type": "loss", "content": 0.004064720589667559, "timestamp": "2025-09-30 22:13:11.474953", "step": 2291, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:11.536884", "step": 2291, "epoch": 3 }, { "type": "loss", "content": 0.027135172858834267, "timestamp": "2025-09-30 22:13:11.543184", "step": 2292, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:11.602293", "step": 2292, "epoch": 3 }, { "type": "loss", "content": 0.0008837772184051573, "timestamp": "2025-09-30 22:13:11.608386", "step": 2293, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:11.665675", "step": 2293, "epoch": 3 }, { "type": "loss", "content": 0.011004587635397911, "timestamp": "2025-09-30 22:13:11.668035", "step": 2294, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:11.727743", "step": 2294, "epoch": 3 }, { "type": "loss", "content": 0.004693236667662859, "timestamp": "2025-09-30 22:13:11.730020", "step": 2295, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:11.788422", "step": 2295, "epoch": 3 }, { "type": "loss", "content": 0.0006029639625921845, "timestamp": "2025-09-30 22:13:11.798640", "step": 2296, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:11.854358", "step": 2296, "epoch": 3 }, { "type": "loss", "content": 0.001073610968887806, "timestamp": "2025-09-30 22:13:11.856714", "step": 2297, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:11.915935", "step": 2297, "epoch": 3 }, { "type": "loss", "content": 0.0027264878153800964, "timestamp": "2025-09-30 22:13:11.920656", "step": 2298, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:11.980816", "step": 2298, "epoch": 3 }, { "type": "loss", "content": 0.0007334630936384201, "timestamp": "2025-09-30 22:13:11.985247", "step": 2299, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:12.059254", "step": 2299, "epoch": 3 }, { "type": "loss", "content": 0.0011301154736429453, "timestamp": "2025-09-30 22:13:12.065752", "step": 2300, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:12.123890", "step": 2300, "epoch": 3 }, { "type": "loss", "content": 0.00015225273091346025, "timestamp": "2025-09-30 22:13:12.128224", "step": 2301, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:12.196012", "step": 2301, "epoch": 3 }, { "type": "loss", "content": 0.000535293947905302, "timestamp": "2025-09-30 22:13:12.198299", "step": 2302, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:12.252848", "step": 2302, "epoch": 3 }, { "type": "loss", "content": 0.004955708514899015, "timestamp": "2025-09-30 22:13:12.260839", "step": 2303, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:12.319753", "step": 2303, "epoch": 3 }, { "type": "loss", "content": 0.00846810918301344, "timestamp": "2025-09-30 22:13:12.328509", "step": 2304, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:12.383083", "step": 2304, "epoch": 3 }, { "type": "loss", "content": 6.449552165577188e-05, "timestamp": "2025-09-30 22:13:12.388975", "step": 2305, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:12.465214", "step": 2305, "epoch": 3 }, { "type": "loss", "content": 0.004094495438039303, "timestamp": "2025-09-30 22:13:12.468338", "step": 2306, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:12.524076", "step": 2306, "epoch": 3 }, { "type": "loss", "content": 0.0043173558078706264, "timestamp": "2025-09-30 22:13:12.531143", "step": 2307, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:12.586832", "step": 2307, "epoch": 3 }, { "type": "loss", "content": 0.013621537014842033, "timestamp": "2025-09-30 22:13:12.593105", "step": 2308, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:12.647208", "step": 2308, "epoch": 3 }, { "type": "loss", "content": 0.00045176432467997074, "timestamp": "2025-09-30 22:13:12.651462", "step": 2309, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:12.706835", "step": 2309, "epoch": 3 }, { "type": "loss", "content": 0.00153114995919168, "timestamp": "2025-09-30 22:13:12.709825", "step": 2310, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:12.768571", "step": 2310, "epoch": 3 }, { "type": "loss", "content": 0.0002749506966210902, "timestamp": "2025-09-30 22:13:12.773973", "step": 2311, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:12.830788", "step": 2311, "epoch": 3 }, { "type": "loss", "content": 0.0001482378429500386, "timestamp": "2025-09-30 22:13:12.836862", "step": 2312, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:12.899423", "step": 2312, "epoch": 3 }, { "type": "loss", "content": 0.0001635663356864825, "timestamp": "2025-09-30 22:13:12.905845", "step": 2313, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:12.972178", "step": 2313, "epoch": 3 }, { "type": "loss", "content": 2.553769627411384e-05, "timestamp": "2025-09-30 22:13:12.974513", "step": 2314, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:13.030917", "step": 2314, "epoch": 3 }, { "type": "loss", "content": 0.014685734175145626, "timestamp": "2025-09-30 22:13:13.033978", "step": 2315, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:13.092800", "step": 2315, "epoch": 3 }, { "type": "loss", "content": 7.750854274490848e-05, "timestamp": "2025-09-30 22:13:13.105173", "step": 2316, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:13.160613", "step": 2316, "epoch": 3 }, { "type": "loss", "content": 0.0010388302616775036, "timestamp": "2025-09-30 22:13:13.164054", "step": 2317, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:13.231936", "step": 2317, "epoch": 3 }, { "type": "loss", "content": 0.0024059859570115805, "timestamp": "2025-09-30 22:13:13.239512", "step": 2318, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:13.295025", "step": 2318, "epoch": 3 }, { "type": "loss", "content": 0.00040224703843705356, "timestamp": "2025-09-30 22:13:13.304151", "step": 2319, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:13.366702", "step": 2319, "epoch": 3 }, { "type": "loss", "content": 0.000600070517975837, "timestamp": "2025-09-30 22:13:13.372841", "step": 2320, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:13.428888", "step": 2320, "epoch": 3 }, { "type": "loss", "content": 0.043553676456213, "timestamp": "2025-09-30 22:13:13.432384", "step": 2321, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:13.495898", "step": 2321, "epoch": 3 }, { "type": "loss", "content": 0.01901986077427864, "timestamp": "2025-09-30 22:13:13.498854", "step": 2322, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:13.566548", "step": 2322, "epoch": 3 }, { "type": "loss", "content": 0.020047681406140327, "timestamp": "2025-09-30 22:13:13.569703", "step": 2323, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:13.625179", "step": 2323, "epoch": 3 }, { "type": "loss", "content": 0.00016043984214775264, "timestamp": "2025-09-30 22:13:13.632855", "step": 2324, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:13.696114", "step": 2324, "epoch": 3 }, { "type": "loss", "content": 0.012856409884989262, "timestamp": "2025-09-30 22:13:13.699713", "step": 2325, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:13.762802", "step": 2325, "epoch": 3 }, { "type": "loss", "content": 0.004349694121629, "timestamp": "2025-09-30 22:13:13.765900", "step": 2326, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:13.825731", "step": 2326, "epoch": 3 }, { "type": "loss", "content": 0.0007128869765438139, "timestamp": "2025-09-30 22:13:13.829485", "step": 2327, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:13.886792", "step": 2327, "epoch": 3 }, { "type": "loss", "content": 0.006236379034817219, "timestamp": "2025-09-30 22:13:13.901117", "step": 2328, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:13.960939", "step": 2328, "epoch": 3 }, { "type": "loss", "content": 0.0003488468355499208, "timestamp": "2025-09-30 22:13:13.964455", "step": 2329, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:14.036143", "step": 2329, "epoch": 3 }, { "type": "loss", "content": 0.00010532321175560355, "timestamp": "2025-09-30 22:13:14.039724", "step": 2330, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:14.099722", "step": 2330, "epoch": 3 }, { "type": "loss", "content": 0.03490881994366646, "timestamp": "2025-09-30 22:13:14.103491", "step": 2331, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:14.168611", "step": 2331, "epoch": 3 }, { "type": "loss", "content": 0.013101531192660332, "timestamp": "2025-09-30 22:13:14.179820", "step": 2332, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:14.240568", "step": 2332, "epoch": 3 }, { "type": "loss", "content": 0.0005494463839568198, "timestamp": "2025-09-30 22:13:14.244365", "step": 2333, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:14.304060", "step": 2333, "epoch": 3 }, { "type": "loss", "content": 0.0002779501664917916, "timestamp": "2025-09-30 22:13:14.315947", "step": 2334, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:14.375221", "step": 2334, "epoch": 3 }, { "type": "loss", "content": 0.0017243318725377321, "timestamp": "2025-09-30 22:13:14.378038", "step": 2335, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:14.440264", "step": 2335, "epoch": 3 }, { "type": "loss", "content": 0.0026360393967479467, "timestamp": "2025-09-30 22:13:14.452840", "step": 2336, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:14.513305", "step": 2336, "epoch": 3 }, { "type": "loss", "content": 0.0022121057845652103, "timestamp": "2025-09-30 22:13:14.521054", "step": 2337, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:13:15.906861", "step": 2337, "epoch": 3 }, { "type": "pplx", "content": 35685924.762230136, "timestamp": "2025-09-30 22:13:15.911272", "step": 2337, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:15.969547", "step": 2337, "epoch": 3 }, { "type": "loss", "content": 0.0001224317093146965, "timestamp": "2025-09-30 22:13:15.973320", "step": 2338, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:16.041128", "step": 2338, "epoch": 3 }, { "type": "loss", "content": 0.0006064993212930858, "timestamp": "2025-09-30 22:13:16.044973", "step": 2339, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:16.100730", "step": 2339, "epoch": 3 }, { "type": "loss", "content": 0.003981334622949362, "timestamp": "2025-09-30 22:13:16.107765", "step": 2340, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:16.166753", "step": 2340, "epoch": 3 }, { "type": "loss", "content": 0.00029052264289930463, "timestamp": "2025-09-30 22:13:16.171192", "step": 2341, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:16.226872", "step": 2341, "epoch": 3 }, { "type": "loss", "content": 0.010386792942881584, "timestamp": "2025-09-30 22:13:16.230245", "step": 2342, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:16.294054", "step": 2342, "epoch": 3 }, { "type": "loss", "content": 5.578704076469876e-05, "timestamp": "2025-09-30 22:13:16.297024", "step": 2343, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:16.356213", "step": 2343, "epoch": 3 }, { "type": "loss", "content": 0.0007381403702311218, "timestamp": "2025-09-30 22:13:16.369322", "step": 2344, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:16.426141", "step": 2344, "epoch": 3 }, { "type": "loss", "content": 0.03852478787302971, "timestamp": "2025-09-30 22:13:16.430172", "step": 2345, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:16.489012", "step": 2345, "epoch": 3 }, { "type": "loss", "content": 0.0006424127495847642, "timestamp": "2025-09-30 22:13:16.499829", "step": 2346, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:16.556480", "step": 2346, "epoch": 3 }, { "type": "loss", "content": 9.567866800352931e-05, "timestamp": "2025-09-30 22:13:16.560414", "step": 2347, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:16.615496", "step": 2347, "epoch": 3 }, { "type": "loss", "content": 0.009399794973433018, "timestamp": "2025-09-30 22:13:16.630284", "step": 2348, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:16.693055", "step": 2348, "epoch": 3 }, { "type": "loss", "content": 0.0003211422299500555, "timestamp": "2025-09-30 22:13:16.697092", "step": 2349, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:16.757223", "step": 2349, "epoch": 3 }, { "type": "loss", "content": 0.0036444012075662613, "timestamp": "2025-09-30 22:13:16.762122", "step": 2350, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:16.821943", "step": 2350, "epoch": 3 }, { "type": "loss", "content": 0.026028577238321304, "timestamp": "2025-09-30 22:13:16.832804", "step": 2351, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:16.901109", "step": 2351, "epoch": 3 }, { "type": "loss", "content": 0.0048171780072152615, "timestamp": "2025-09-30 22:13:16.908059", "step": 2352, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:16.962841", "step": 2352, "epoch": 3 }, { "type": "loss", "content": 3.750595715246163e-05, "timestamp": "2025-09-30 22:13:16.965327", "step": 2353, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:17.025383", "step": 2353, "epoch": 3 }, { "type": "loss", "content": 0.008849648758769035, "timestamp": "2025-09-30 22:13:17.028464", "step": 2354, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:17.092237", "step": 2354, "epoch": 3 }, { "type": "loss", "content": 6.525561184389517e-05, "timestamp": "2025-09-30 22:13:17.100739", "step": 2355, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:17.163544", "step": 2355, "epoch": 3 }, { "type": "loss", "content": 9.634840534999967e-05, "timestamp": "2025-09-30 22:13:17.171199", "step": 2356, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:17.228897", "step": 2356, "epoch": 3 }, { "type": "loss", "content": 0.0001659039407968521, "timestamp": "2025-09-30 22:13:17.233707", "step": 2357, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:17.301413", "step": 2357, "epoch": 3 }, { "type": "loss", "content": 8.300376066472381e-05, "timestamp": "2025-09-30 22:13:17.306270", "step": 2358, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:17.365873", "step": 2358, "epoch": 3 }, { "type": "loss", "content": 0.0006306396098807454, "timestamp": "2025-09-30 22:13:17.369232", "step": 2359, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:17.436726", "step": 2359, "epoch": 3 }, { "type": "loss", "content": 0.012602239847183228, "timestamp": "2025-09-30 22:13:17.443521", "step": 2360, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:17.504255", "step": 2360, "epoch": 3 }, { "type": "loss", "content": 6.385560845956206e-05, "timestamp": "2025-09-30 22:13:17.513534", "step": 2361, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:17.570422", "step": 2361, "epoch": 3 }, { "type": "loss", "content": 0.0011304274667054415, "timestamp": "2025-09-30 22:13:17.573485", "step": 2362, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:17.631234", "step": 2362, "epoch": 3 }, { "type": "loss", "content": 0.0038645591121166945, "timestamp": "2025-09-30 22:13:17.634878", "step": 2363, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:17.691433", "step": 2363, "epoch": 3 }, { "type": "loss", "content": 0.007124242372810841, "timestamp": "2025-09-30 22:13:17.704027", "step": 2364, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:17.764565", "step": 2364, "epoch": 3 }, { "type": "loss", "content": 0.0018359085079282522, "timestamp": "2025-09-30 22:13:17.773024", "step": 2365, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:17.833422", "step": 2365, "epoch": 3 }, { "type": "loss", "content": 0.014947568997740746, "timestamp": "2025-09-30 22:13:17.835913", "step": 2366, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:17.893406", "step": 2366, "epoch": 3 }, { "type": "loss", "content": 0.00046386828762479126, "timestamp": "2025-09-30 22:13:17.896906", "step": 2367, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:17.960708", "step": 2367, "epoch": 3 }, { "type": "loss", "content": 0.008881064131855965, "timestamp": "2025-09-30 22:13:17.968071", "step": 2368, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:18.023659", "step": 2368, "epoch": 3 }, { "type": "loss", "content": 0.0001027476173476316, "timestamp": "2025-09-30 22:13:18.026536", "step": 2369, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:18.086101", "step": 2369, "epoch": 3 }, { "type": "loss", "content": 5.726577728637494e-05, "timestamp": "2025-09-30 22:13:18.089037", "step": 2370, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:18.152428", "step": 2370, "epoch": 3 }, { "type": "loss", "content": 0.0024566694628447294, "timestamp": "2025-09-30 22:13:18.155698", "step": 2371, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:18.215358", "step": 2371, "epoch": 3 }, { "type": "loss", "content": 0.002534597646445036, "timestamp": "2025-09-30 22:13:18.222732", "step": 2372, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:18.287719", "step": 2372, "epoch": 3 }, { "type": "loss", "content": 0.04354690760374069, "timestamp": "2025-09-30 22:13:18.292228", "step": 2373, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:18.348467", "step": 2373, "epoch": 3 }, { "type": "loss", "content": 0.017217475920915604, "timestamp": "2025-09-30 22:13:18.352618", "step": 2374, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:18.411457", "step": 2374, "epoch": 3 }, { "type": "loss", "content": 0.007565335836261511, "timestamp": "2025-09-30 22:13:18.415699", "step": 2375, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:18.472300", "step": 2375, "epoch": 3 }, { "type": "loss", "content": 0.026072219014167786, "timestamp": "2025-09-30 22:13:18.480814", "step": 2376, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:18.548341", "step": 2376, "epoch": 3 }, { "type": "loss", "content": 0.0003426198090892285, "timestamp": "2025-09-30 22:13:18.552600", "step": 2377, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:18.609365", "step": 2377, "epoch": 3 }, { "type": "loss", "content": 0.07469062507152557, "timestamp": "2025-09-30 22:13:18.613435", "step": 2378, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:18.677596", "step": 2378, "epoch": 3 }, { "type": "loss", "content": 0.023098181933164597, "timestamp": "2025-09-30 22:13:18.680446", "step": 2379, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:18.742608", "step": 2379, "epoch": 3 }, { "type": "loss", "content": 0.0065413895063102245, "timestamp": "2025-09-30 22:13:18.749044", "step": 2380, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:18.804305", "step": 2380, "epoch": 3 }, { "type": "loss", "content": 0.009016537107527256, "timestamp": "2025-09-30 22:13:18.808806", "step": 2381, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:18.872537", "step": 2381, "epoch": 3 }, { "type": "loss", "content": 0.005769102368503809, "timestamp": "2025-09-30 22:13:18.883286", "step": 2382, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:18.939108", "step": 2382, "epoch": 3 }, { "type": "loss", "content": 0.03586021065711975, "timestamp": "2025-09-30 22:13:18.943050", "step": 2383, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:18.999422", "step": 2383, "epoch": 3 }, { "type": "loss", "content": 0.032269734889268875, "timestamp": "2025-09-30 22:13:19.006351", "step": 2384, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:19.069159", "step": 2384, "epoch": 3 }, { "type": "loss", "content": 0.00165548047516495, "timestamp": "2025-09-30 22:13:19.072039", "step": 2385, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:19.131862", "step": 2385, "epoch": 3 }, { "type": "loss", "content": 0.0024231227580457926, "timestamp": "2025-09-30 22:13:19.134492", "step": 2386, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:19.198543", "step": 2386, "epoch": 3 }, { "type": "loss", "content": 0.04199283942580223, "timestamp": "2025-09-30 22:13:19.201655", "step": 2387, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:19.257463", "step": 2387, "epoch": 3 }, { "type": "loss", "content": 0.03284038230776787, "timestamp": "2025-09-30 22:13:19.273136", "step": 2388, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:19.330743", "step": 2388, "epoch": 3 }, { "type": "loss", "content": 0.0017338943434879184, "timestamp": "2025-09-30 22:13:19.333819", "step": 2389, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:19.390251", "step": 2389, "epoch": 3 }, { "type": "loss", "content": 0.004067179746925831, "timestamp": "2025-09-30 22:13:19.407477", "step": 2390, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:19.473602", "step": 2390, "epoch": 3 }, { "type": "loss", "content": 0.002591915661469102, "timestamp": "2025-09-30 22:13:19.482602", "step": 2391, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:19.543242", "step": 2391, "epoch": 3 }, { "type": "loss", "content": 0.006193013396114111, "timestamp": "2025-09-30 22:13:19.550256", "step": 2392, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:19.613204", "step": 2392, "epoch": 3 }, { "type": "loss", "content": 0.018228931352496147, "timestamp": "2025-09-30 22:13:19.616517", "step": 2393, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:19.673910", "step": 2393, "epoch": 3 }, { "type": "loss", "content": 0.0011682414915412664, "timestamp": "2025-09-30 22:13:19.676994", "step": 2394, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:13:21.124634", "step": 2394, "epoch": 3 }, { "type": "pplx", "content": 32188811.96573708, "timestamp": "2025-09-30 22:13:21.128270", "step": 2394, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:21.183488", "step": 2394, "epoch": 3 }, { "type": "loss", "content": 0.005388319492340088, "timestamp": "2025-09-30 22:13:21.187245", "step": 2395, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:21.247593", "step": 2395, "epoch": 3 }, { "type": "loss", "content": 0.013586322776973248, "timestamp": "2025-09-30 22:13:21.254124", "step": 2396, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:21.312845", "step": 2396, "epoch": 3 }, { "type": "loss", "content": 0.006401594262570143, "timestamp": "2025-09-30 22:13:21.321760", "step": 2397, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:21.379109", "step": 2397, "epoch": 3 }, { "type": "loss", "content": 0.009359225630760193, "timestamp": "2025-09-30 22:13:21.381831", "step": 2398, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:21.444723", "step": 2398, "epoch": 3 }, { "type": "loss", "content": 0.021020669490098953, "timestamp": "2025-09-30 22:13:21.448756", "step": 2399, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:21.512288", "step": 2399, "epoch": 3 }, { "type": "loss", "content": 0.012792134657502174, "timestamp": "2025-09-30 22:13:21.518632", "step": 2400, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:21.573224", "step": 2400, "epoch": 3 }, { "type": "loss", "content": 0.004996354691684246, "timestamp": "2025-09-30 22:13:21.575642", "step": 2401, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:13:21.637750", "step": 2401, "epoch": 3 }, { "type": "loss", "content": 0.029606416821479797, "timestamp": "2025-09-30 22:13:21.642603", "step": 2402, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:21.700319", "step": 2402, "epoch": 3 }, { "type": "loss", "content": 0.006963818334043026, "timestamp": "2025-09-30 22:13:21.703901", "step": 2403, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:21.761290", "step": 2403, "epoch": 3 }, { "type": "loss", "content": 0.0031231886241585016, "timestamp": "2025-09-30 22:13:21.768141", "step": 2404, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:21.832875", "step": 2404, "epoch": 3 }, { "type": "loss", "content": 0.005416753236204386, "timestamp": "2025-09-30 22:13:21.836985", "step": 2405, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:21.901336", "step": 2405, "epoch": 3 }, { "type": "loss", "content": 0.007782486267387867, "timestamp": "2025-09-30 22:13:21.914547", "step": 2406, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:21.973355", "step": 2406, "epoch": 3 }, { "type": "loss", "content": 0.013428935781121254, "timestamp": "2025-09-30 22:13:21.976497", "step": 2407, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:22.032826", "step": 2407, "epoch": 3 }, { "type": "loss", "content": 0.016297975555062294, "timestamp": "2025-09-30 22:13:22.040439", "step": 2408, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:22.100144", "step": 2408, "epoch": 3 }, { "type": "loss", "content": 0.00910912174731493, "timestamp": "2025-09-30 22:13:22.105263", "step": 2409, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:22.160966", "step": 2409, "epoch": 3 }, { "type": "loss", "content": 0.011663010343909264, "timestamp": "2025-09-30 22:13:22.170743", "step": 2410, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:22.228090", "step": 2410, "epoch": 3 }, { "type": "loss", "content": 0.010669506154954433, "timestamp": "2025-09-30 22:13:22.230710", "step": 2411, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:13:22.289260", "step": 2411, "epoch": 3 }, { "type": "loss", "content": 0.009635014459490776, "timestamp": "2025-09-30 22:13:22.295350", "step": 2412, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:22.355178", "step": 2412, "epoch": 3 }, { "type": "loss", "content": 0.006291415076702833, "timestamp": "2025-09-30 22:13:22.358455", "step": 2413, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:22.416008", "step": 2413, "epoch": 3 }, { "type": "loss", "content": 0.005919110961258411, "timestamp": "2025-09-30 22:13:22.424105", "step": 2414, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:22.480075", "step": 2414, "epoch": 3 }, { "type": "loss", "content": 0.009971419349312782, "timestamp": "2025-09-30 22:13:22.482487", "step": 2415, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:22.538731", "step": 2415, "epoch": 3 }, { "type": "loss", "content": 0.006832667160779238, "timestamp": "2025-09-30 22:13:22.545680", "step": 2416, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:22.601841", "step": 2416, "epoch": 3 }, { "type": "loss", "content": 0.012653900310397148, "timestamp": "2025-09-30 22:13:22.604658", "step": 2417, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:22.663734", "step": 2417, "epoch": 3 }, { "type": "loss", "content": 0.009062431752681732, "timestamp": "2025-09-30 22:13:22.667671", "step": 2418, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:22.732330", "step": 2418, "epoch": 3 }, { "type": "loss", "content": 0.0064752777107059956, "timestamp": "2025-09-30 22:13:22.737635", "step": 2419, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:22.798336", "step": 2419, "epoch": 3 }, { "type": "loss", "content": 0.003598024370148778, "timestamp": "2025-09-30 22:13:22.806812", "step": 2420, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:22.865289", "step": 2420, "epoch": 3 }, { "type": "loss", "content": 0.002874630270525813, "timestamp": "2025-09-30 22:13:22.869382", "step": 2421, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:22.925654", "step": 2421, "epoch": 3 }, { "type": "loss", "content": 0.03282652422785759, "timestamp": "2025-09-30 22:13:22.929496", "step": 2422, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:22.984177", "step": 2422, "epoch": 3 }, { "type": "loss", "content": 0.013241315260529518, "timestamp": "2025-09-30 22:13:22.987735", "step": 2423, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:23.051284", "step": 2423, "epoch": 3 }, { "type": "loss", "content": 0.014345109462738037, "timestamp": "2025-09-30 22:13:23.059332", "step": 2424, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 3520021436096.0 }, "timestamp": "2025-09-30 22:13:23.121038", "step": 2424, "epoch": 3 }, { "type": "loss", "content": 0.015767786651849747, "timestamp": "2025-09-30 22:13:23.123336", "step": 2425, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:23.184027", "step": 2425, "epoch": 3 }, { "type": "loss", "content": 0.004914857912808657, "timestamp": "2025-09-30 22:13:23.190312", "step": 2426, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:23.247883", "step": 2426, "epoch": 3 }, { "type": "loss", "content": 0.021198097616434097, "timestamp": "2025-09-30 22:13:23.252020", "step": 2427, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:23.314039", "step": 2427, "epoch": 3 }, { "type": "loss", "content": 0.000634256808552891, "timestamp": "2025-09-30 22:13:23.320233", "step": 2428, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:23.373727", "step": 2428, "epoch": 3 }, { "type": "loss", "content": 0.010591315105557442, "timestamp": "2025-09-30 22:13:23.376300", "step": 2429, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:23.434026", "step": 2429, "epoch": 3 }, { "type": "loss", "content": 0.019355354830622673, "timestamp": "2025-09-30 22:13:23.443163", "step": 2430, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:23.505531", "step": 2430, "epoch": 3 }, { "type": "loss", "content": 0.01241142489016056, "timestamp": "2025-09-30 22:13:23.511100", "step": 2431, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:23.567662", "step": 2431, "epoch": 3 }, { "type": "loss", "content": 0.009565332904458046, "timestamp": "2025-09-30 22:13:23.575405", "step": 2432, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:23.632837", "step": 2432, "epoch": 3 }, { "type": "loss", "content": 0.025015641003847122, "timestamp": "2025-09-30 22:13:23.635530", "step": 2433, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:23.697422", "step": 2433, "epoch": 3 }, { "type": "loss", "content": 0.006486161146312952, "timestamp": "2025-09-30 22:13:23.709455", "step": 2434, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:23.771333", "step": 2434, "epoch": 3 }, { "type": "loss", "content": 0.011576437391340733, "timestamp": "2025-09-30 22:13:23.776736", "step": 2435, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:23.837294", "step": 2435, "epoch": 3 }, { "type": "loss", "content": 0.015482393093407154, "timestamp": "2025-09-30 22:13:23.844023", "step": 2436, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:23.899133", "step": 2436, "epoch": 3 }, { "type": "loss", "content": 0.003182504326105118, "timestamp": "2025-09-30 22:13:23.904724", "step": 2437, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:23.964847", "step": 2437, "epoch": 3 }, { "type": "loss", "content": 0.011627678759396076, "timestamp": "2025-09-30 22:13:23.968295", "step": 2438, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:24.025865", "step": 2438, "epoch": 3 }, { "type": "loss", "content": 0.0038793303538113832, "timestamp": "2025-09-30 22:13:24.032278", "step": 2439, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:24.096569", "step": 2439, "epoch": 3 }, { "type": "loss", "content": 0.005496421363204718, "timestamp": "2025-09-30 22:13:24.108928", "step": 2440, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:24.169991", "step": 2440, "epoch": 3 }, { "type": "loss", "content": 0.012969347648322582, "timestamp": "2025-09-30 22:13:24.176292", "step": 2441, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:24.237921", "step": 2441, "epoch": 3 }, { "type": "loss", "content": 0.005602761637419462, "timestamp": "2025-09-30 22:13:24.242046", "step": 2442, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:24.305589", "step": 2442, "epoch": 3 }, { "type": "loss", "content": 0.00627124821767211, "timestamp": "2025-09-30 22:13:24.309441", "step": 2443, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:24.369126", "step": 2443, "epoch": 3 }, { "type": "loss", "content": 0.01086896751075983, "timestamp": "2025-09-30 22:13:24.377066", "step": 2444, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:24.430661", "step": 2444, "epoch": 3 }, { "type": "loss", "content": 0.026062259450554848, "timestamp": "2025-09-30 22:13:24.434410", "step": 2445, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:24.496901", "step": 2445, "epoch": 3 }, { "type": "loss", "content": 0.012748646549880505, "timestamp": "2025-09-30 22:13:24.499871", "step": 2446, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:24.560400", "step": 2446, "epoch": 3 }, { "type": "loss", "content": 0.018161652609705925, "timestamp": "2025-09-30 22:13:24.567688", "step": 2447, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:24.621315", "step": 2447, "epoch": 3 }, { "type": "loss", "content": 0.006955179385840893, "timestamp": "2025-09-30 22:13:24.627375", "step": 2448, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:24.684306", "step": 2448, "epoch": 3 }, { "type": "loss", "content": 0.011628582142293453, "timestamp": "2025-09-30 22:13:24.688144", "step": 2449, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:24.748192", "step": 2449, "epoch": 3 }, { "type": "loss", "content": 0.00847693346440792, "timestamp": "2025-09-30 22:13:24.751297", "step": 2450, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:24.811240", "step": 2450, "epoch": 3 }, { "type": "loss", "content": 0.017132606357336044, "timestamp": "2025-09-30 22:13:24.814566", "step": 2451, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:13:26.264249", "step": 2451, "epoch": 3 }, { "type": "pplx", "content": 28315239.44436738, "timestamp": "2025-09-30 22:13:26.268190", "step": 2451, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:26.323933", "step": 2451, "epoch": 3 }, { "type": "loss", "content": 0.03608303517103195, "timestamp": "2025-09-30 22:13:26.329858", "step": 2452, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:26.390834", "step": 2452, "epoch": 3 }, { "type": "loss", "content": 0.005534134339541197, "timestamp": "2025-09-30 22:13:26.397219", "step": 2453, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:26.465956", "step": 2453, "epoch": 3 }, { "type": "loss", "content": 0.003311963053420186, "timestamp": "2025-09-30 22:13:26.470711", "step": 2454, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:26.534839", "step": 2454, "epoch": 3 }, { "type": "loss", "content": 0.03324703872203827, "timestamp": "2025-09-30 22:13:26.539042", "step": 2455, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:26.598664", "step": 2455, "epoch": 3 }, { "type": "loss", "content": 0.0027091566007584333, "timestamp": "2025-09-30 22:13:26.606372", "step": 2456, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:26.666515", "step": 2456, "epoch": 3 }, { "type": "loss", "content": 0.012049831449985504, "timestamp": "2025-09-30 22:13:26.669406", "step": 2457, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:26.725248", "step": 2457, "epoch": 3 }, { "type": "loss", "content": 0.007934476248919964, "timestamp": "2025-09-30 22:13:26.728122", "step": 2458, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:26.784509", "step": 2458, "epoch": 3 }, { "type": "loss", "content": 0.0067762285470962524, "timestamp": "2025-09-30 22:13:26.787065", "step": 2459, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:26.846731", "step": 2459, "epoch": 3 }, { "type": "loss", "content": 0.0006125275394879282, "timestamp": "2025-09-30 22:13:26.852534", "step": 2460, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:26.909867", "step": 2460, "epoch": 3 }, { "type": "loss", "content": 0.001969063188880682, "timestamp": "2025-09-30 22:13:26.919777", "step": 2461, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:26.982301", "step": 2461, "epoch": 3 }, { "type": "loss", "content": 0.01936742290854454, "timestamp": "2025-09-30 22:13:26.985867", "step": 2462, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:27.056042", "step": 2462, "epoch": 3 }, { "type": "loss", "content": 0.02352396585047245, "timestamp": "2025-09-30 22:13:27.062482", "step": 2463, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:27.118380", "step": 2463, "epoch": 3 }, { "type": "loss", "content": 0.021059229969978333, "timestamp": "2025-09-30 22:13:27.124270", "step": 2464, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:27.185631", "step": 2464, "epoch": 3 }, { "type": "loss", "content": 0.0014644553884863853, "timestamp": "2025-09-30 22:13:27.191249", "step": 2465, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:27.261864", "step": 2465, "epoch": 3 }, { "type": "loss", "content": 0.002741077449172735, "timestamp": "2025-09-30 22:13:27.264795", "step": 2466, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:27.320924", "step": 2466, "epoch": 3 }, { "type": "loss", "content": 0.003315993817523122, "timestamp": "2025-09-30 22:13:27.324832", "step": 2467, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:27.380192", "step": 2467, "epoch": 3 }, { "type": "loss", "content": 0.016506491228938103, "timestamp": "2025-09-30 22:13:27.387391", "step": 2468, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:27.450382", "step": 2468, "epoch": 3 }, { "type": "loss", "content": 0.005784686654806137, "timestamp": "2025-09-30 22:13:27.453645", "step": 2469, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:27.515525", "step": 2469, "epoch": 3 }, { "type": "loss", "content": 0.024664176627993584, "timestamp": "2025-09-30 22:13:27.518250", "step": 2470, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:27.572542", "step": 2470, "epoch": 3 }, { "type": "loss", "content": 0.00011582062143133953, "timestamp": "2025-09-30 22:13:27.578850", "step": 2471, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:27.637005", "step": 2471, "epoch": 3 }, { "type": "loss", "content": 0.007763525936752558, "timestamp": "2025-09-30 22:13:27.646004", "step": 2472, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:27.702172", "step": 2472, "epoch": 3 }, { "type": "loss", "content": 0.010659711435437202, "timestamp": "2025-09-30 22:13:27.706063", "step": 2473, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:27.777540", "step": 2473, "epoch": 3 }, { "type": "loss", "content": 0.0018466237233951688, "timestamp": "2025-09-30 22:13:27.779959", "step": 2474, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:27.852539", "step": 2474, "epoch": 3 }, { "type": "loss", "content": 0.0035447957925498486, "timestamp": "2025-09-30 22:13:27.859813", "step": 2475, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:27.920238", "step": 2475, "epoch": 3 }, { "type": "loss", "content": 0.005354622844606638, "timestamp": "2025-09-30 22:13:27.926963", "step": 2476, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:27.989674", "step": 2476, "epoch": 3 }, { "type": "loss", "content": 0.0005044917925260961, "timestamp": "2025-09-30 22:13:27.992747", "step": 2477, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:28.048142", "step": 2477, "epoch": 3 }, { "type": "loss", "content": 0.02057606168091297, "timestamp": "2025-09-30 22:13:28.051060", "step": 2478, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:28.108531", "step": 2478, "epoch": 3 }, { "type": "loss", "content": 0.00041592912748456, "timestamp": "2025-09-30 22:13:28.110908", "step": 2479, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:28.173240", "step": 2479, "epoch": 3 }, { "type": "loss", "content": 0.02707737870514393, "timestamp": "2025-09-30 22:13:28.179409", "step": 2480, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:28.234239", "step": 2480, "epoch": 3 }, { "type": "loss", "content": 0.008141888305544853, "timestamp": "2025-09-30 22:13:28.237098", "step": 2481, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:28.294134", "step": 2481, "epoch": 3 }, { "type": "loss", "content": 0.0005826229462400079, "timestamp": "2025-09-30 22:13:28.297607", "step": 2482, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:28.354318", "step": 2482, "epoch": 3 }, { "type": "loss", "content": 0.0010274297092109919, "timestamp": "2025-09-30 22:13:28.358929", "step": 2483, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:28.416658", "step": 2483, "epoch": 3 }, { "type": "loss", "content": 0.00012387447350192815, "timestamp": "2025-09-30 22:13:28.424114", "step": 2484, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:28.486895", "step": 2484, "epoch": 3 }, { "type": "loss", "content": 0.0458819679915905, "timestamp": "2025-09-30 22:13:28.489480", "step": 2485, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:28.545121", "step": 2485, "epoch": 3 }, { "type": "loss", "content": 0.0024083128664642572, "timestamp": "2025-09-30 22:13:28.549784", "step": 2486, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:28.612063", "step": 2486, "epoch": 3 }, { "type": "loss", "content": 0.018058612942695618, "timestamp": "2025-09-30 22:13:28.615950", "step": 2487, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:28.677326", "step": 2487, "epoch": 3 }, { "type": "loss", "content": 0.016298463568091393, "timestamp": "2025-09-30 22:13:28.683260", "step": 2488, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:28.739919", "step": 2488, "epoch": 3 }, { "type": "loss", "content": 0.007573925890028477, "timestamp": "2025-09-30 22:13:28.742847", "step": 2489, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:28.802419", "step": 2489, "epoch": 3 }, { "type": "loss", "content": 0.00014498885138891637, "timestamp": "2025-09-30 22:13:28.807170", "step": 2490, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:28.872060", "step": 2490, "epoch": 3 }, { "type": "loss", "content": 6.842377479188144e-05, "timestamp": "2025-09-30 22:13:28.879991", "step": 2491, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:28.951699", "step": 2491, "epoch": 3 }, { "type": "loss", "content": 0.027848348021507263, "timestamp": "2025-09-30 22:13:28.958119", "step": 2492, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:29.023111", "step": 2492, "epoch": 3 }, { "type": "loss", "content": 0.0003194676246494055, "timestamp": "2025-09-30 22:13:29.026405", "step": 2493, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:29.085635", "step": 2493, "epoch": 3 }, { "type": "loss", "content": 0.004121209029108286, "timestamp": "2025-09-30 22:13:29.091479", "step": 2494, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:29.151939", "step": 2494, "epoch": 3 }, { "type": "loss", "content": 0.030184591189026833, "timestamp": "2025-09-30 22:13:29.154364", "step": 2495, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:29.213774", "step": 2495, "epoch": 3 }, { "type": "loss", "content": 0.00010497510811546817, "timestamp": "2025-09-30 22:13:29.225978", "step": 2496, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:29.288120", "step": 2496, "epoch": 3 }, { "type": "loss", "content": 0.006152989808470011, "timestamp": "2025-09-30 22:13:29.298523", "step": 2497, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:29.366801", "step": 2497, "epoch": 3 }, { "type": "loss", "content": 0.010256282053887844, "timestamp": "2025-09-30 22:13:29.378777", "step": 2498, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:29.443430", "step": 2498, "epoch": 3 }, { "type": "loss", "content": 0.0030539052095264196, "timestamp": "2025-09-30 22:13:29.445695", "step": 2499, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:29.502248", "step": 2499, "epoch": 3 }, { "type": "loss", "content": 0.005430816672742367, "timestamp": "2025-09-30 22:13:29.512673", "step": 2500, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2500", "timestamp": "2025-09-30 22:13:29.992354", "step": 2500, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:30.056711", "step": 2500, "epoch": 3 }, { "type": "loss", "content": 0.0053467415273189545, "timestamp": "2025-09-30 22:13:30.068444", "step": 2501, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:30.125796", "step": 2501, "epoch": 3 }, { "type": "loss", "content": 0.011339363642036915, "timestamp": "2025-09-30 22:13:30.131339", "step": 2502, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:30.192817", "step": 2502, "epoch": 3 }, { "type": "loss", "content": 0.010253089480102062, "timestamp": "2025-09-30 22:13:30.202343", "step": 2503, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:30.266346", "step": 2503, "epoch": 3 }, { "type": "loss", "content": 0.04441189020872116, "timestamp": "2025-09-30 22:13:30.284633", "step": 2504, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:30.343260", "step": 2504, "epoch": 3 }, { "type": "loss", "content": 0.0009698430076241493, "timestamp": "2025-09-30 22:13:30.346379", "step": 2505, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:30.415313", "step": 2505, "epoch": 3 }, { "type": "loss", "content": 0.008782203309237957, "timestamp": "2025-09-30 22:13:30.421399", "step": 2506, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:30.488357", "step": 2506, "epoch": 3 }, { "type": "loss", "content": 0.0034844086039811373, "timestamp": "2025-09-30 22:13:30.495408", "step": 2507, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:30.561594", "step": 2507, "epoch": 3 }, { "type": "loss", "content": 0.02779925987124443, "timestamp": "2025-09-30 22:13:30.570543", "step": 2508, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:13:32.146436", "step": 2508, "epoch": 3 }, { "type": "pplx", "content": 28484982.005562108, "timestamp": "2025-09-30 22:13:32.149120", "step": 2508, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:32.201869", "step": 2508, "epoch": 3 }, { "type": "loss", "content": 0.04372706264257431, "timestamp": "2025-09-30 22:13:32.204873", "step": 2509, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:32.277804", "step": 2509, "epoch": 3 }, { "type": "loss", "content": 0.006275986321270466, "timestamp": "2025-09-30 22:13:32.294723", "step": 2510, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:32.372737", "step": 2510, "epoch": 3 }, { "type": "loss", "content": 0.007544786669313908, "timestamp": "2025-09-30 22:13:32.395531", "step": 2511, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:32.472031", "step": 2511, "epoch": 3 }, { "type": "loss", "content": 0.030385946854948997, "timestamp": "2025-09-30 22:13:32.497664", "step": 2512, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:32.576892", "step": 2512, "epoch": 3 }, { "type": "loss", "content": 0.02495487593114376, "timestamp": "2025-09-30 22:13:32.608326", "step": 2513, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:32.686036", "step": 2513, "epoch": 3 }, { "type": "loss", "content": 0.005949577782303095, "timestamp": "2025-09-30 22:13:32.709754", "step": 2514, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:32.779864", "step": 2514, "epoch": 3 }, { "type": "loss", "content": 0.016375141218304634, "timestamp": "2025-09-30 22:13:32.800113", "step": 2515, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:32.881676", "step": 2515, "epoch": 3 }, { "type": "loss", "content": 0.03323351964354515, "timestamp": "2025-09-30 22:13:32.895106", "step": 2516, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:32.968779", "step": 2516, "epoch": 3 }, { "type": "loss", "content": 0.010078134015202522, "timestamp": "2025-09-30 22:13:32.983705", "step": 2517, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:33.056435", "step": 2517, "epoch": 3 }, { "type": "loss", "content": 0.005874228663742542, "timestamp": "2025-09-30 22:13:33.068804", "step": 2518, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:33.139380", "step": 2518, "epoch": 3 }, { "type": "loss", "content": 0.026764482259750366, "timestamp": "2025-09-30 22:13:33.150956", "step": 2519, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:33.215944", "step": 2519, "epoch": 3 }, { "type": "loss", "content": 0.007020972203463316, "timestamp": "2025-09-30 22:13:33.230128", "step": 2520, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:33.306129", "step": 2520, "epoch": 3 }, { "type": "loss", "content": 0.002482133684679866, "timestamp": "2025-09-30 22:13:33.321965", "step": 2521, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:33.407397", "step": 2521, "epoch": 3 }, { "type": "loss", "content": 0.0031678727827966213, "timestamp": "2025-09-30 22:13:33.422133", "step": 2522, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:33.491337", "step": 2522, "epoch": 3 }, { "type": "loss", "content": 0.03531847894191742, "timestamp": "2025-09-30 22:13:33.512412", "step": 2523, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:33.578179", "step": 2523, "epoch": 3 }, { "type": "loss", "content": 0.012394100427627563, "timestamp": "2025-09-30 22:13:33.600136", "step": 2524, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:33.666404", "step": 2524, "epoch": 3 }, { "type": "loss", "content": 0.011437847279012203, "timestamp": "2025-09-30 22:13:33.682021", "step": 2525, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:33.771138", "step": 2525, "epoch": 3 }, { "type": "loss", "content": 0.002483924152329564, "timestamp": "2025-09-30 22:13:33.791110", "step": 2526, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:33.863813", "step": 2526, "epoch": 3 }, { "type": "loss", "content": 0.01781102456152439, "timestamp": "2025-09-30 22:13:33.879117", "step": 2527, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:33.948914", "step": 2527, "epoch": 3 }, { "type": "loss", "content": 0.02498701587319374, "timestamp": "2025-09-30 22:13:33.974229", "step": 2528, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:34.047825", "step": 2528, "epoch": 3 }, { "type": "loss", "content": 0.004967492539435625, "timestamp": "2025-09-30 22:13:34.057256", "step": 2529, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:34.120092", "step": 2529, "epoch": 3 }, { "type": "loss", "content": 0.008416562341153622, "timestamp": "2025-09-30 22:13:34.124286", "step": 2530, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:34.180050", "step": 2530, "epoch": 3 }, { "type": "loss", "content": 0.004394114948809147, "timestamp": "2025-09-30 22:13:34.188245", "step": 2531, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:34.255255", "step": 2531, "epoch": 3 }, { "type": "loss", "content": 0.0030704778619110584, "timestamp": "2025-09-30 22:13:34.262524", "step": 2532, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:34.322691", "step": 2532, "epoch": 3 }, { "type": "loss", "content": 0.021440336480736732, "timestamp": "2025-09-30 22:13:34.325981", "step": 2533, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:34.381080", "step": 2533, "epoch": 3 }, { "type": "loss", "content": 0.013712027110159397, "timestamp": "2025-09-30 22:13:34.389556", "step": 2534, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:34.449231", "step": 2534, "epoch": 3 }, { "type": "loss", "content": 0.006967604160308838, "timestamp": "2025-09-30 22:13:34.457892", "step": 2535, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:34.518202", "step": 2535, "epoch": 3 }, { "type": "loss", "content": 0.003231785027310252, "timestamp": "2025-09-30 22:13:34.529400", "step": 2536, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:34.584782", "step": 2536, "epoch": 3 }, { "type": "loss", "content": 0.018372103571891785, "timestamp": "2025-09-30 22:13:34.594720", "step": 2537, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:34.652122", "step": 2537, "epoch": 3 }, { "type": "loss", "content": 0.005035504698753357, "timestamp": "2025-09-30 22:13:34.660471", "step": 2538, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:34.717692", "step": 2538, "epoch": 3 }, { "type": "loss", "content": 0.008455309085547924, "timestamp": "2025-09-30 22:13:34.720813", "step": 2539, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:34.778433", "step": 2539, "epoch": 3 }, { "type": "loss", "content": 0.000451569736469537, "timestamp": "2025-09-30 22:13:34.785623", "step": 2540, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:34.857888", "step": 2540, "epoch": 3 }, { "type": "loss", "content": 0.0005161632434464991, "timestamp": "2025-09-30 22:13:34.861478", "step": 2541, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:34.929400", "step": 2541, "epoch": 3 }, { "type": "loss", "content": 0.004660214763134718, "timestamp": "2025-09-30 22:13:34.931747", "step": 2542, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:35.002296", "step": 2542, "epoch": 3 }, { "type": "loss", "content": 0.005621172953397036, "timestamp": "2025-09-30 22:13:35.004815", "step": 2543, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:35.062321", "step": 2543, "epoch": 3 }, { "type": "loss", "content": 0.02957882173359394, "timestamp": "2025-09-30 22:13:35.069169", "step": 2544, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:13:35.140995", "step": 2544, "epoch": 3 }, { "type": "loss", "content": 0.009165803901851177, "timestamp": "2025-09-30 22:13:35.143954", "step": 2545, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:35.201122", "step": 2545, "epoch": 3 }, { "type": "loss", "content": 0.02128801867365837, "timestamp": "2025-09-30 22:13:35.208203", "step": 2546, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:35.268216", "step": 2546, "epoch": 3 }, { "type": "loss", "content": 0.005340006668120623, "timestamp": "2025-09-30 22:13:35.276293", "step": 2547, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:35.340625", "step": 2547, "epoch": 3 }, { "type": "loss", "content": 0.007607621140778065, "timestamp": "2025-09-30 22:13:35.348466", "step": 2548, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:35.415112", "step": 2548, "epoch": 3 }, { "type": "loss", "content": 0.021063433960080147, "timestamp": "2025-09-30 22:13:35.418212", "step": 2549, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:35.482643", "step": 2549, "epoch": 3 }, { "type": "loss", "content": 0.011848625726997852, "timestamp": "2025-09-30 22:13:35.486294", "step": 2550, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:35.553913", "step": 2550, "epoch": 3 }, { "type": "loss", "content": 0.010470031760632992, "timestamp": "2025-09-30 22:13:35.557896", "step": 2551, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:35.614322", "step": 2551, "epoch": 3 }, { "type": "loss", "content": 0.007319572381675243, "timestamp": "2025-09-30 22:13:35.625177", "step": 2552, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:35.683824", "step": 2552, "epoch": 3 }, { "type": "loss", "content": 0.002631863346323371, "timestamp": "2025-09-30 22:13:35.686736", "step": 2553, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:35.743155", "step": 2553, "epoch": 3 }, { "type": "loss", "content": 0.026114294305443764, "timestamp": "2025-09-30 22:13:35.745574", "step": 2554, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:35.799953", "step": 2554, "epoch": 3 }, { "type": "loss", "content": 0.0004028195107821375, "timestamp": "2025-09-30 22:13:35.802048", "step": 2555, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:35.861107", "step": 2555, "epoch": 3 }, { "type": "loss", "content": 0.0005155097460374236, "timestamp": "2025-09-30 22:13:35.868717", "step": 2556, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:35.931861", "step": 2556, "epoch": 3 }, { "type": "loss", "content": 0.014193429611623287, "timestamp": "2025-09-30 22:13:35.935249", "step": 2557, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:35.991284", "step": 2557, "epoch": 3 }, { "type": "loss", "content": 0.023700231686234474, "timestamp": "2025-09-30 22:13:35.997758", "step": 2558, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:36.057147", "step": 2558, "epoch": 3 }, { "type": "loss", "content": 0.013391217216849327, "timestamp": "2025-09-30 22:13:36.060664", "step": 2559, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:36.126611", "step": 2559, "epoch": 3 }, { "type": "loss", "content": 0.0035434234887361526, "timestamp": "2025-09-30 22:13:36.132365", "step": 2560, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:36.191606", "step": 2560, "epoch": 3 }, { "type": "loss", "content": 0.02454542927443981, "timestamp": "2025-09-30 22:13:36.199936", "step": 2561, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:36.279053", "step": 2561, "epoch": 3 }, { "type": "loss", "content": 0.0029066719580441713, "timestamp": "2025-09-30 22:13:36.287710", "step": 2562, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:36.365339", "step": 2562, "epoch": 3 }, { "type": "loss", "content": 0.006104097701609135, "timestamp": "2025-09-30 22:13:36.368124", "step": 2563, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:36.428303", "step": 2563, "epoch": 3 }, { "type": "loss", "content": 0.010215085931122303, "timestamp": "2025-09-30 22:13:36.437744", "step": 2564, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:36.503906", "step": 2564, "epoch": 3 }, { "type": "loss", "content": 0.005062797572463751, "timestamp": "2025-09-30 22:13:36.509952", "step": 2565, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:13:37.982199", "step": 2565, "epoch": 3 }, { "type": "pplx", "content": 28095844.67479086, "timestamp": "2025-09-30 22:13:37.984464", "step": 2565, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:38.037632", "step": 2565, "epoch": 3 }, { "type": "loss", "content": 0.010721017606556416, "timestamp": "2025-09-30 22:13:38.039899", "step": 2566, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:38.108144", "step": 2566, "epoch": 3 }, { "type": "loss", "content": 0.003516018856316805, "timestamp": "2025-09-30 22:13:38.110577", "step": 2567, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:38.167050", "step": 2567, "epoch": 3 }, { "type": "loss", "content": 0.0029169064946472645, "timestamp": "2025-09-30 22:13:38.174014", "step": 2568, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:38.237774", "step": 2568, "epoch": 3 }, { "type": "loss", "content": 0.0035148721653968096, "timestamp": "2025-09-30 22:13:38.240537", "step": 2569, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:38.296057", "step": 2569, "epoch": 3 }, { "type": "loss", "content": 0.033282749354839325, "timestamp": "2025-09-30 22:13:38.300103", "step": 2570, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:38.361289", "step": 2570, "epoch": 3 }, { "type": "loss", "content": 0.010799924843013287, "timestamp": "2025-09-30 22:13:38.363894", "step": 2571, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:38.428150", "step": 2571, "epoch": 3 }, { "type": "loss", "content": 0.005024234298616648, "timestamp": "2025-09-30 22:13:38.434615", "step": 2572, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:38.494982", "step": 2572, "epoch": 3 }, { "type": "loss", "content": 0.0024888780899345875, "timestamp": "2025-09-30 22:13:38.497928", "step": 2573, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:38.564411", "step": 2573, "epoch": 3 }, { "type": "loss", "content": 0.014496888034045696, "timestamp": "2025-09-30 22:13:38.573061", "step": 2574, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:38.629771", "step": 2574, "epoch": 3 }, { "type": "loss", "content": 0.006988645065575838, "timestamp": "2025-09-30 22:13:38.637345", "step": 2575, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:38.697004", "step": 2575, "epoch": 3 }, { "type": "loss", "content": 0.01132346410304308, "timestamp": "2025-09-30 22:13:38.703263", "step": 2576, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:38.764133", "step": 2576, "epoch": 3 }, { "type": "loss", "content": 0.0035836149472743273, "timestamp": "2025-09-30 22:13:38.767094", "step": 2577, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:38.830451", "step": 2577, "epoch": 3 }, { "type": "loss", "content": 0.013794556260108948, "timestamp": "2025-09-30 22:13:38.833653", "step": 2578, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:38.892206", "step": 2578, "epoch": 3 }, { "type": "loss", "content": 0.004456694237887859, "timestamp": "2025-09-30 22:13:38.895467", "step": 2579, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:38.958609", "step": 2579, "epoch": 3 }, { "type": "loss", "content": 0.006100859493017197, "timestamp": "2025-09-30 22:13:38.965114", "step": 2580, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:39.029425", "step": 2580, "epoch": 3 }, { "type": "loss", "content": 0.0033564306795597076, "timestamp": "2025-09-30 22:13:39.031433", "step": 2581, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:39.088665", "step": 2581, "epoch": 3 }, { "type": "loss", "content": 0.0037206211127340794, "timestamp": "2025-09-30 22:13:39.095796", "step": 2582, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:39.153293", "step": 2582, "epoch": 3 }, { "type": "loss", "content": 0.006556731648743153, "timestamp": "2025-09-30 22:13:39.155960", "step": 2583, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:39.210529", "step": 2583, "epoch": 3 }, { "type": "loss", "content": 0.01759198307991028, "timestamp": "2025-09-30 22:13:39.216788", "step": 2584, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:39.271308", "step": 2584, "epoch": 3 }, { "type": "loss", "content": 0.012082146480679512, "timestamp": "2025-09-30 22:13:39.273747", "step": 2585, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:39.328828", "step": 2585, "epoch": 3 }, { "type": "loss", "content": 0.034108925610780716, "timestamp": "2025-09-30 22:13:39.331233", "step": 2586, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:39.389845", "step": 2586, "epoch": 3 }, { "type": "loss", "content": 0.003907191567122936, "timestamp": "2025-09-30 22:13:39.403544", "step": 2587, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:39.465603", "step": 2587, "epoch": 3 }, { "type": "loss", "content": 0.009326784871518612, "timestamp": "2025-09-30 22:13:39.472112", "step": 2588, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:39.531850", "step": 2588, "epoch": 3 }, { "type": "loss", "content": 0.008430427871644497, "timestamp": "2025-09-30 22:13:39.535223", "step": 2589, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:39.599643", "step": 2589, "epoch": 3 }, { "type": "loss", "content": 0.005537763237953186, "timestamp": "2025-09-30 22:13:39.602815", "step": 2590, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:39.659746", "step": 2590, "epoch": 3 }, { "type": "loss", "content": 0.00847475416958332, "timestamp": "2025-09-30 22:13:39.664121", "step": 2591, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:39.726143", "step": 2591, "epoch": 3 }, { "type": "loss", "content": 0.0008981380960904062, "timestamp": "2025-09-30 22:13:39.736910", "step": 2592, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:39.801757", "step": 2592, "epoch": 3 }, { "type": "loss", "content": 0.0013419969473034143, "timestamp": "2025-09-30 22:13:39.805646", "step": 2593, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:39.860221", "step": 2593, "epoch": 3 }, { "type": "loss", "content": 0.0038829329423606396, "timestamp": "2025-09-30 22:13:39.870646", "step": 2594, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:39.925607", "step": 2594, "epoch": 3 }, { "type": "loss", "content": 0.0077403089962899685, "timestamp": "2025-09-30 22:13:39.928827", "step": 2595, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-30 22:13:39.984471", "step": 2595, "epoch": 3 }, { "type": "loss", "content": 0.013570351526141167, "timestamp": "2025-09-30 22:13:39.990851", "step": 2596, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:40.054183", "step": 2596, "epoch": 3 }, { "type": "loss", "content": 0.005910519044846296, "timestamp": "2025-09-30 22:13:40.057403", "step": 2597, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:40.115868", "step": 2597, "epoch": 3 }, { "type": "loss", "content": 0.0021806147415190935, "timestamp": "2025-09-30 22:13:40.119422", "step": 2598, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:40.176335", "step": 2598, "epoch": 3 }, { "type": "loss", "content": 0.002977479714900255, "timestamp": "2025-09-30 22:13:40.184658", "step": 2599, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:40.248724", "step": 2599, "epoch": 3 }, { "type": "loss", "content": 0.0011497695231810212, "timestamp": "2025-09-30 22:13:40.259275", "step": 2600, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:40.316010", "step": 2600, "epoch": 3 }, { "type": "loss", "content": 0.0021200214978307486, "timestamp": "2025-09-30 22:13:40.322723", "step": 2601, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:40.384991", "step": 2601, "epoch": 3 }, { "type": "loss", "content": 0.003689644392579794, "timestamp": "2025-09-30 22:13:40.388971", "step": 2602, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:40.451268", "step": 2602, "epoch": 3 }, { "type": "loss", "content": 0.002164714504033327, "timestamp": "2025-09-30 22:13:40.454859", "step": 2603, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:40.516437", "step": 2603, "epoch": 3 }, { "type": "loss", "content": 0.024041039869189262, "timestamp": "2025-09-30 22:13:40.523219", "step": 2604, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:40.581524", "step": 2604, "epoch": 3 }, { "type": "loss", "content": 0.053784094750881195, "timestamp": "2025-09-30 22:13:40.587466", "step": 2605, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:40.643075", "step": 2605, "epoch": 3 }, { "type": "loss", "content": 0.00025220931274816394, "timestamp": "2025-09-30 22:13:40.648320", "step": 2606, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:40.706291", "step": 2606, "epoch": 3 }, { "type": "loss", "content": 0.0412961021065712, "timestamp": "2025-09-30 22:13:40.709924", "step": 2607, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:40.768686", "step": 2607, "epoch": 3 }, { "type": "loss", "content": 0.0008691218099556863, "timestamp": "2025-09-30 22:13:40.775025", "step": 2608, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:40.829933", "step": 2608, "epoch": 3 }, { "type": "loss", "content": 0.012250705622136593, "timestamp": "2025-09-30 22:13:40.840134", "step": 2609, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:40.896800", "step": 2609, "epoch": 3 }, { "type": "loss", "content": 0.007466053124517202, "timestamp": "2025-09-30 22:13:40.899879", "step": 2610, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:40.962887", "step": 2610, "epoch": 3 }, { "type": "loss", "content": 0.0008426569984294474, "timestamp": "2025-09-30 22:13:40.975062", "step": 2611, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:41.033291", "step": 2611, "epoch": 3 }, { "type": "loss", "content": 0.001527618383988738, "timestamp": "2025-09-30 22:13:41.040007", "step": 2612, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:41.094083", "step": 2612, "epoch": 3 }, { "type": "loss", "content": 0.05261802300810814, "timestamp": "2025-09-30 22:13:41.098284", "step": 2613, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:41.154590", "step": 2613, "epoch": 3 }, { "type": "loss", "content": 0.00033618827001191676, "timestamp": "2025-09-30 22:13:41.159766", "step": 2614, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:41.219906", "step": 2614, "epoch": 3 }, { "type": "loss", "content": 0.013994032517075539, "timestamp": "2025-09-30 22:13:41.222645", "step": 2615, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:41.277574", "step": 2615, "epoch": 3 }, { "type": "loss", "content": 0.001805710606276989, "timestamp": "2025-09-30 22:13:41.283924", "step": 2616, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:41.338699", "step": 2616, "epoch": 3 }, { "type": "loss", "content": 0.01090067345649004, "timestamp": "2025-09-30 22:13:41.341341", "step": 2617, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:41.395509", "step": 2617, "epoch": 3 }, { "type": "loss", "content": 0.010763258673250675, "timestamp": "2025-09-30 22:13:41.398233", "step": 2618, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:41.456411", "step": 2618, "epoch": 3 }, { "type": "loss", "content": 0.00023230792430695146, "timestamp": "2025-09-30 22:13:41.459052", "step": 2619, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:41.517705", "step": 2619, "epoch": 3 }, { "type": "loss", "content": 0.00010520854266360402, "timestamp": "2025-09-30 22:13:41.526262", "step": 2620, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:41.592677", "step": 2620, "epoch": 3 }, { "type": "loss", "content": 8.937703387346119e-05, "timestamp": "2025-09-30 22:13:41.596741", "step": 2621, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:41.651606", "step": 2621, "epoch": 3 }, { "type": "loss", "content": 0.003324234625324607, "timestamp": "2025-09-30 22:13:41.656269", "step": 2622, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:13:42.991390", "step": 2622, "epoch": 3 }, { "type": "pplx", "content": 31437886.455709014, "timestamp": "2025-09-30 22:13:42.996628", "step": 2622, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:43.052565", "step": 2622, "epoch": 3 }, { "type": "loss", "content": 0.005293331108987331, "timestamp": "2025-09-30 22:13:43.059695", "step": 2623, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:43.124834", "step": 2623, "epoch": 3 }, { "type": "loss", "content": 9.892590605886653e-05, "timestamp": "2025-09-30 22:13:43.132164", "step": 2624, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:43.192440", "step": 2624, "epoch": 3 }, { "type": "loss", "content": 0.033769041299819946, "timestamp": "2025-09-30 22:13:43.201469", "step": 2625, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:43.266274", "step": 2625, "epoch": 3 }, { "type": "loss", "content": 0.036175686866045, "timestamp": "2025-09-30 22:13:43.269492", "step": 2626, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:43.334734", "step": 2626, "epoch": 3 }, { "type": "loss", "content": 0.0016300681745633483, "timestamp": "2025-09-30 22:13:43.349674", "step": 2627, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:43.405168", "step": 2627, "epoch": 3 }, { "type": "loss", "content": 0.052250683307647705, "timestamp": "2025-09-30 22:13:43.412369", "step": 2628, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:43.477284", "step": 2628, "epoch": 3 }, { "type": "loss", "content": 0.003264912636950612, "timestamp": "2025-09-30 22:13:43.480096", "step": 2629, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:43.539111", "step": 2629, "epoch": 3 }, { "type": "loss", "content": 0.0006807534955441952, "timestamp": "2025-09-30 22:13:43.544956", "step": 2630, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:43.601818", "step": 2630, "epoch": 3 }, { "type": "loss", "content": 0.0006088690715841949, "timestamp": "2025-09-30 22:13:43.610272", "step": 2631, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:43.673117", "step": 2631, "epoch": 3 }, { "type": "loss", "content": 0.006588431540876627, "timestamp": "2025-09-30 22:13:43.691680", "step": 2632, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:43.747030", "step": 2632, "epoch": 3 }, { "type": "loss", "content": 0.0006200451171025634, "timestamp": "2025-09-30 22:13:43.750018", "step": 2633, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:43.811484", "step": 2633, "epoch": 3 }, { "type": "loss", "content": 0.01981678046286106, "timestamp": "2025-09-30 22:13:43.814840", "step": 2634, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:43.873252", "step": 2634, "epoch": 3 }, { "type": "loss", "content": 0.003335049608722329, "timestamp": "2025-09-30 22:13:43.875994", "step": 2635, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:43.939080", "step": 2635, "epoch": 3 }, { "type": "loss", "content": 0.02097000740468502, "timestamp": "2025-09-30 22:13:43.951340", "step": 2636, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:44.008281", "step": 2636, "epoch": 3 }, { "type": "loss", "content": 0.007635528687387705, "timestamp": "2025-09-30 22:13:44.014140", "step": 2637, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:44.072624", "step": 2637, "epoch": 3 }, { "type": "loss", "content": 0.0021020916756242514, "timestamp": "2025-09-30 22:13:44.078675", "step": 2638, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:44.142189", "step": 2638, "epoch": 3 }, { "type": "loss", "content": 0.00463180523365736, "timestamp": "2025-09-30 22:13:44.145158", "step": 2639, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:44.207719", "step": 2639, "epoch": 3 }, { "type": "loss", "content": 0.00478312699124217, "timestamp": "2025-09-30 22:13:44.215952", "step": 2640, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:44.272292", "step": 2640, "epoch": 3 }, { "type": "loss", "content": 0.0008612548117525876, "timestamp": "2025-09-30 22:13:44.276660", "step": 2641, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:44.344905", "step": 2641, "epoch": 3 }, { "type": "loss", "content": 0.0024010143242776394, "timestamp": "2025-09-30 22:13:44.357248", "step": 2642, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:44.423670", "step": 2642, "epoch": 3 }, { "type": "loss", "content": 0.003124420763924718, "timestamp": "2025-09-30 22:13:44.427994", "step": 2643, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:44.495267", "step": 2643, "epoch": 3 }, { "type": "loss", "content": 0.013605816289782524, "timestamp": "2025-09-30 22:13:44.502640", "step": 2644, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:44.560412", "step": 2644, "epoch": 3 }, { "type": "loss", "content": 0.0009547712397761643, "timestamp": "2025-09-30 22:13:44.572377", "step": 2645, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:44.628288", "step": 2645, "epoch": 3 }, { "type": "loss", "content": 0.0024611575063318014, "timestamp": "2025-09-30 22:13:44.632917", "step": 2646, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:44.693593", "step": 2646, "epoch": 3 }, { "type": "loss", "content": 0.010525861755013466, "timestamp": "2025-09-30 22:13:44.701639", "step": 2647, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:44.761423", "step": 2647, "epoch": 3 }, { "type": "loss", "content": 0.014524830505251884, "timestamp": "2025-09-30 22:13:44.768375", "step": 2648, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:44.830634", "step": 2648, "epoch": 3 }, { "type": "loss", "content": 0.0008129668422043324, "timestamp": "2025-09-30 22:13:44.836483", "step": 2649, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:44.893886", "step": 2649, "epoch": 3 }, { "type": "loss", "content": 0.004618306644260883, "timestamp": "2025-09-30 22:13:44.896386", "step": 2650, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:44.953263", "step": 2650, "epoch": 3 }, { "type": "loss", "content": 0.0023160020355135202, "timestamp": "2025-09-30 22:13:44.955676", "step": 2651, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:45.012135", "step": 2651, "epoch": 3 }, { "type": "loss", "content": 0.0021287633571773767, "timestamp": "2025-09-30 22:13:45.020394", "step": 2652, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:45.075628", "step": 2652, "epoch": 3 }, { "type": "loss", "content": 0.015038742683827877, "timestamp": "2025-09-30 22:13:45.082391", "step": 2653, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:45.137061", "step": 2653, "epoch": 3 }, { "type": "loss", "content": 0.0270217452198267, "timestamp": "2025-09-30 22:13:45.142322", "step": 2654, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:45.202368", "step": 2654, "epoch": 3 }, { "type": "loss", "content": 0.05145685747265816, "timestamp": "2025-09-30 22:13:45.206286", "step": 2655, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:45.265867", "step": 2655, "epoch": 3 }, { "type": "loss", "content": 0.008141874335706234, "timestamp": "2025-09-30 22:13:45.272859", "step": 2656, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:45.327084", "step": 2656, "epoch": 3 }, { "type": "loss", "content": 0.002790646394714713, "timestamp": "2025-09-30 22:13:45.335543", "step": 2657, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:45.394029", "step": 2657, "epoch": 3 }, { "type": "loss", "content": 0.008883380331099033, "timestamp": "2025-09-30 22:13:45.396873", "step": 2658, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:45.459598", "step": 2658, "epoch": 3 }, { "type": "loss", "content": 0.006654282100498676, "timestamp": "2025-09-30 22:13:45.466734", "step": 2659, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:45.533117", "step": 2659, "epoch": 3 }, { "type": "loss", "content": 0.01170284952968359, "timestamp": "2025-09-30 22:13:45.539352", "step": 2660, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:45.593881", "step": 2660, "epoch": 3 }, { "type": "loss", "content": 0.007521068211644888, "timestamp": "2025-09-30 22:13:45.597392", "step": 2661, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:45.658090", "step": 2661, "epoch": 3 }, { "type": "loss", "content": 0.007094702683389187, "timestamp": "2025-09-30 22:13:45.661569", "step": 2662, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:45.716779", "step": 2662, "epoch": 3 }, { "type": "loss", "content": 0.008204026147723198, "timestamp": "2025-09-30 22:13:45.719813", "step": 2663, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:45.776614", "step": 2663, "epoch": 3 }, { "type": "loss", "content": 0.011364804580807686, "timestamp": "2025-09-30 22:13:45.783795", "step": 2664, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:45.840029", "step": 2664, "epoch": 3 }, { "type": "loss", "content": 0.015917399898171425, "timestamp": "2025-09-30 22:13:45.842921", "step": 2665, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:45.905060", "step": 2665, "epoch": 3 }, { "type": "loss", "content": 0.006572159472852945, "timestamp": "2025-09-30 22:13:45.914197", "step": 2666, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:45.975171", "step": 2666, "epoch": 3 }, { "type": "loss", "content": 0.0034901797771453857, "timestamp": "2025-09-30 22:13:45.977763", "step": 2667, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:46.035916", "step": 2667, "epoch": 3 }, { "type": "loss", "content": 0.014389263466000557, "timestamp": "2025-09-30 22:13:46.042903", "step": 2668, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:46.099722", "step": 2668, "epoch": 3 }, { "type": "loss", "content": 0.002424341393634677, "timestamp": "2025-09-30 22:13:46.106840", "step": 2669, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:46.166856", "step": 2669, "epoch": 3 }, { "type": "loss", "content": 0.01114154327660799, "timestamp": "2025-09-30 22:13:46.169241", "step": 2670, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:46.223757", "step": 2670, "epoch": 3 }, { "type": "loss", "content": 0.023684168234467506, "timestamp": "2025-09-30 22:13:46.226315", "step": 2671, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:46.283561", "step": 2671, "epoch": 3 }, { "type": "loss", "content": 0.002832164289429784, "timestamp": "2025-09-30 22:13:46.299391", "step": 2672, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:46.362243", "step": 2672, "epoch": 3 }, { "type": "loss", "content": 0.008885924704372883, "timestamp": "2025-09-30 22:13:46.367794", "step": 2673, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:46.430281", "step": 2673, "epoch": 3 }, { "type": "loss", "content": 0.0018890589708462358, "timestamp": "2025-09-30 22:13:46.433980", "step": 2674, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:46.491992", "step": 2674, "epoch": 3 }, { "type": "loss", "content": 0.004877452738583088, "timestamp": "2025-09-30 22:13:46.503349", "step": 2675, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:46.572081", "step": 2675, "epoch": 3 }, { "type": "loss", "content": 0.004309082869440317, "timestamp": "2025-09-30 22:13:46.578554", "step": 2676, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:46.639190", "step": 2676, "epoch": 3 }, { "type": "loss", "content": 0.009656992740929127, "timestamp": "2025-09-30 22:13:46.642116", "step": 2677, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:46.699896", "step": 2677, "epoch": 3 }, { "type": "loss", "content": 0.020586643368005753, "timestamp": "2025-09-30 22:13:46.702389", "step": 2678, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:46.761163", "step": 2678, "epoch": 3 }, { "type": "loss", "content": 0.0008080621482804418, "timestamp": "2025-09-30 22:13:46.770580", "step": 2679, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:13:48.223899", "step": 2679, "epoch": 3 }, { "type": "pplx", "content": 27226157.57321593, "timestamp": "2025-09-30 22:13:48.229459", "step": 2679, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:48.285279", "step": 2679, "epoch": 3 }, { "type": "loss", "content": 0.0005241225007921457, "timestamp": "2025-09-30 22:13:48.291639", "step": 2680, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:48.347738", "step": 2680, "epoch": 3 }, { "type": "loss", "content": 0.00306515721604228, "timestamp": "2025-09-30 22:13:48.350274", "step": 2681, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:48.408822", "step": 2681, "epoch": 3 }, { "type": "loss", "content": 0.00246833311393857, "timestamp": "2025-09-30 22:13:48.411670", "step": 2682, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:48.465693", "step": 2682, "epoch": 3 }, { "type": "loss", "content": 0.0012627762043848634, "timestamp": "2025-09-30 22:13:48.468124", "step": 2683, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:48.525025", "step": 2683, "epoch": 3 }, { "type": "loss", "content": 0.00040524639189243317, "timestamp": "2025-09-30 22:13:48.530951", "step": 2684, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:48.596407", "step": 2684, "epoch": 3 }, { "type": "loss", "content": 0.0009502816246822476, "timestamp": "2025-09-30 22:13:48.606005", "step": 2685, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:48.671814", "step": 2685, "epoch": 3 }, { "type": "loss", "content": 0.007616551127284765, "timestamp": "2025-09-30 22:13:48.675296", "step": 2686, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:48.732914", "step": 2686, "epoch": 3 }, { "type": "loss", "content": 0.005208977032452822, "timestamp": "2025-09-30 22:13:48.736853", "step": 2687, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:48.806110", "step": 2687, "epoch": 3 }, { "type": "loss", "content": 0.0008302520145662129, "timestamp": "2025-09-30 22:13:48.823721", "step": 2688, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:48.883982", "step": 2688, "epoch": 3 }, { "type": "loss", "content": 0.0001742185850162059, "timestamp": "2025-09-30 22:13:48.887570", "step": 2689, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:48.956348", "step": 2689, "epoch": 3 }, { "type": "loss", "content": 0.0018354164203628898, "timestamp": "2025-09-30 22:13:48.959330", "step": 2690, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:49.020243", "step": 2690, "epoch": 3 }, { "type": "loss", "content": 0.006067172158509493, "timestamp": "2025-09-30 22:13:49.023125", "step": 2691, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:49.083643", "step": 2691, "epoch": 3 }, { "type": "loss", "content": 0.023693010210990906, "timestamp": "2025-09-30 22:13:49.095187", "step": 2692, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:49.161575", "step": 2692, "epoch": 3 }, { "type": "loss", "content": 0.0013803176116198301, "timestamp": "2025-09-30 22:13:49.168354", "step": 2693, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:49.230112", "step": 2693, "epoch": 3 }, { "type": "loss", "content": 0.001813818933442235, "timestamp": "2025-09-30 22:13:49.233718", "step": 2694, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:49.293224", "step": 2694, "epoch": 3 }, { "type": "loss", "content": 0.0011404008837416768, "timestamp": "2025-09-30 22:13:49.302174", "step": 2695, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:49.359140", "step": 2695, "epoch": 3 }, { "type": "loss", "content": 0.007178679574280977, "timestamp": "2025-09-30 22:13:49.370459", "step": 2696, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:49.429837", "step": 2696, "epoch": 3 }, { "type": "loss", "content": 0.0011296061566099524, "timestamp": "2025-09-30 22:13:49.438620", "step": 2697, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:49.496637", "step": 2697, "epoch": 3 }, { "type": "loss", "content": 0.0011266040382906795, "timestamp": "2025-09-30 22:13:49.502242", "step": 2698, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:49.561787", "step": 2698, "epoch": 3 }, { "type": "loss", "content": 0.0006445986800827086, "timestamp": "2025-09-30 22:13:49.567761", "step": 2699, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:49.624521", "step": 2699, "epoch": 3 }, { "type": "loss", "content": 0.0005764567176811397, "timestamp": "2025-09-30 22:13:49.630654", "step": 2700, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:49.696603", "step": 2700, "epoch": 3 }, { "type": "loss", "content": 0.0010664776200428605, "timestamp": "2025-09-30 22:13:49.700446", "step": 2701, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:49.756793", "step": 2701, "epoch": 3 }, { "type": "loss", "content": 0.00033894533407874405, "timestamp": "2025-09-30 22:13:49.759941", "step": 2702, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:49.820434", "step": 2702, "epoch": 3 }, { "type": "loss", "content": 0.001302658929489553, "timestamp": "2025-09-30 22:13:49.824764", "step": 2703, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:49.879323", "step": 2703, "epoch": 3 }, { "type": "loss", "content": 0.003532660659402609, "timestamp": "2025-09-30 22:13:49.889824", "step": 2704, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:49.945843", "step": 2704, "epoch": 3 }, { "type": "loss", "content": 0.0034972601570189, "timestamp": "2025-09-30 22:13:49.951608", "step": 2705, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:50.016545", "step": 2705, "epoch": 3 }, { "type": "loss", "content": 0.00021062916493974626, "timestamp": "2025-09-30 22:13:50.020864", "step": 2706, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:50.078562", "step": 2706, "epoch": 3 }, { "type": "loss", "content": 0.000698353978805244, "timestamp": "2025-09-30 22:13:50.081409", "step": 2707, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:50.144600", "step": 2707, "epoch": 3 }, { "type": "loss", "content": 0.014168920926749706, "timestamp": "2025-09-30 22:13:50.154442", "step": 2708, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:50.212646", "step": 2708, "epoch": 3 }, { "type": "loss", "content": 0.0037853510584682226, "timestamp": "2025-09-30 22:13:50.220630", "step": 2709, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:50.283443", "step": 2709, "epoch": 3 }, { "type": "loss", "content": 0.010242769494652748, "timestamp": "2025-09-30 22:13:50.290377", "step": 2710, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:50.344434", "step": 2710, "epoch": 3 }, { "type": "loss", "content": 0.0002763148513622582, "timestamp": "2025-09-30 22:13:50.347449", "step": 2711, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:50.411418", "step": 2711, "epoch": 3 }, { "type": "loss", "content": 0.0003090985701419413, "timestamp": "2025-09-30 22:13:50.428813", "step": 2712, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:50.489893", "step": 2712, "epoch": 3 }, { "type": "loss", "content": 0.003153016325086355, "timestamp": "2025-09-30 22:13:50.493106", "step": 2713, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:50.551946", "step": 2713, "epoch": 3 }, { "type": "loss", "content": 0.014958287589251995, "timestamp": "2025-09-30 22:13:50.555908", "step": 2714, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:50.626790", "step": 2714, "epoch": 3 }, { "type": "loss", "content": 0.03713773563504219, "timestamp": "2025-09-30 22:13:50.630948", "step": 2715, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:50.690432", "step": 2715, "epoch": 3 }, { "type": "loss", "content": 0.0021324113477021456, "timestamp": "2025-09-30 22:13:50.699332", "step": 2716, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:50.755210", "step": 2716, "epoch": 3 }, { "type": "loss", "content": 0.010403887368738651, "timestamp": "2025-09-30 22:13:50.758629", "step": 2717, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:50.822734", "step": 2717, "epoch": 3 }, { "type": "loss", "content": 9.978410525945947e-05, "timestamp": "2025-09-30 22:13:50.831743", "step": 2718, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:50.892991", "step": 2718, "epoch": 3 }, { "type": "loss", "content": 0.0008979692356660962, "timestamp": "2025-09-30 22:13:50.896480", "step": 2719, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:50.952988", "step": 2719, "epoch": 3 }, { "type": "loss", "content": 0.012770486064255238, "timestamp": "2025-09-30 22:13:50.965793", "step": 2720, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:51.027607", "step": 2720, "epoch": 3 }, { "type": "loss", "content": 0.0006667285342700779, "timestamp": "2025-09-30 22:13:51.031206", "step": 2721, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:51.089950", "step": 2721, "epoch": 3 }, { "type": "loss", "content": 0.001039764960296452, "timestamp": "2025-09-30 22:13:51.092464", "step": 2722, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:51.157150", "step": 2722, "epoch": 3 }, { "type": "loss", "content": 0.007044041994959116, "timestamp": "2025-09-30 22:13:51.166520", "step": 2723, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:51.225414", "step": 2723, "epoch": 3 }, { "type": "loss", "content": 0.006314881145954132, "timestamp": "2025-09-30 22:13:51.231890", "step": 2724, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:51.293636", "step": 2724, "epoch": 3 }, { "type": "loss", "content": 0.012850704602897167, "timestamp": "2025-09-30 22:13:51.297398", "step": 2725, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:51.352110", "step": 2725, "epoch": 3 }, { "type": "loss", "content": 0.0020042883697897196, "timestamp": "2025-09-30 22:13:51.355893", "step": 2726, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:51.410769", "step": 2726, "epoch": 3 }, { "type": "loss", "content": 0.0015756689244881272, "timestamp": "2025-09-30 22:13:51.414151", "step": 2727, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:51.472208", "step": 2727, "epoch": 3 }, { "type": "loss", "content": 0.0008090141927823424, "timestamp": "2025-09-30 22:13:51.480117", "step": 2728, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:51.543520", "step": 2728, "epoch": 3 }, { "type": "loss", "content": 0.0029392943251878023, "timestamp": "2025-09-30 22:13:51.548126", "step": 2729, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:51.613147", "step": 2729, "epoch": 3 }, { "type": "loss", "content": 0.0013291776413097978, "timestamp": "2025-09-30 22:13:51.616109", "step": 2730, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:51.672856", "step": 2730, "epoch": 3 }, { "type": "loss", "content": 0.001892697880975902, "timestamp": "2025-09-30 22:13:51.684947", "step": 2731, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:51.755930", "step": 2731, "epoch": 3 }, { "type": "loss", "content": 0.0166336540132761, "timestamp": "2025-09-30 22:13:51.769304", "step": 2732, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:51.826448", "step": 2732, "epoch": 3 }, { "type": "loss", "content": 0.0006099882302805781, "timestamp": "2025-09-30 22:13:51.829660", "step": 2733, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:51.889560", "step": 2733, "epoch": 3 }, { "type": "loss", "content": 0.00124065310228616, "timestamp": "2025-09-30 22:13:51.892903", "step": 2734, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:51.953941", "step": 2734, "epoch": 3 }, { "type": "loss", "content": 0.008868148550391197, "timestamp": "2025-09-30 22:13:51.966422", "step": 2735, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:52.031390", "step": 2735, "epoch": 3 }, { "type": "loss", "content": 0.031378235667943954, "timestamp": "2025-09-30 22:13:52.038860", "step": 2736, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:13:53.450275", "step": 2736, "epoch": 3 }, { "type": "pplx", "content": 27265120.538364556, "timestamp": "2025-09-30 22:13:53.453548", "step": 2736, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:53.506520", "step": 2736, "epoch": 3 }, { "type": "loss", "content": 0.004425609949976206, "timestamp": "2025-09-30 22:13:53.510006", "step": 2737, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:53.564344", "step": 2737, "epoch": 3 }, { "type": "loss", "content": 0.00230988091789186, "timestamp": "2025-09-30 22:13:53.567973", "step": 2738, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:53.629941", "step": 2738, "epoch": 3 }, { "type": "loss", "content": 0.012407422065734863, "timestamp": "2025-09-30 22:13:53.632681", "step": 2739, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:53.698644", "step": 2739, "epoch": 3 }, { "type": "loss", "content": 0.0033368500880897045, "timestamp": "2025-09-30 22:13:53.705763", "step": 2740, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:53.761332", "step": 2740, "epoch": 3 }, { "type": "loss", "content": 0.001278606941923499, "timestamp": "2025-09-30 22:13:53.764488", "step": 2741, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:53.832321", "step": 2741, "epoch": 3 }, { "type": "loss", "content": 0.0054036402143538, "timestamp": "2025-09-30 22:13:53.835475", "step": 2742, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:53.891626", "step": 2742, "epoch": 3 }, { "type": "loss", "content": 0.02728707529604435, "timestamp": "2025-09-30 22:13:53.895367", "step": 2743, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:53.954983", "step": 2743, "epoch": 3 }, { "type": "loss", "content": 0.005847959313541651, "timestamp": "2025-09-30 22:13:53.961864", "step": 2744, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:54.025521", "step": 2744, "epoch": 3 }, { "type": "loss", "content": 0.006271410267800093, "timestamp": "2025-09-30 22:13:54.028753", "step": 2745, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:54.089180", "step": 2745, "epoch": 3 }, { "type": "loss", "content": 0.006004456430673599, "timestamp": "2025-09-30 22:13:54.093167", "step": 2746, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:54.169048", "step": 2746, "epoch": 3 }, { "type": "loss", "content": 0.019149092957377434, "timestamp": "2025-09-30 22:13:54.171857", "step": 2747, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-30 22:13:54.246050", "step": 2747, "epoch": 3 }, { "type": "loss", "content": 0.0032882175873965025, "timestamp": "2025-09-30 22:13:54.264454", "step": 2748, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:54.320406", "step": 2748, "epoch": 3 }, { "type": "loss", "content": 0.024987680837512016, "timestamp": "2025-09-30 22:13:54.325559", "step": 2749, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-30 22:13:54.390676", "step": 2749, "epoch": 3 }, { "type": "loss", "content": 0.043651703745126724, "timestamp": "2025-09-30 22:13:54.394730", "step": 2750, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-30 22:13:54.466019", "step": 2750, "epoch": 3 }, { "type": "loss", "content": 0.03387341648340225, "timestamp": "2025-09-30 22:13:54.468879", "step": 2751, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-30 22:13:55.951133", "step": 2751, "epoch": 3 }, { "type": "pplx", "content": 29126071.55555798, "timestamp": "2025-09-30 22:13:55.953357", "step": 2751, "epoch": 3 }, { "type": "best_pplx", "content": 27226157.57321593, "timestamp": "2025-09-30 22:13:55.955439", "step": 2751, "epoch": 3 }, { "type": "best_step", "content": 2679, "timestamp": "2025-09-30 22:13:55.957537", "step": 2751, "epoch": 3 }, { "type": "total_pplx_flops", "content": 5062218940038400, "timestamp": "2025-09-30 22:13:55.959569", "step": 2751, "epoch": 3 }, { "type": "total_train_flops", "content": 7174123736893632.0, "timestamp": "2025-09-30 22:13:55.961809", "step": 2751, "epoch": 3 } ], "best_evals": { "pplx": { "score": 27226157.57321593, "step": 2679 }, "rougel": { "precision": 0.8357843137254902, "recall": 0.8357843137254902, "fmeasure": 0.8357843137254902 } } }